IMPORTAMOS LAS LIBRERÍAS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

FUNCIONES

In [2]:
def plot_feature(df, col_name, isContinuous, target):
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    
    count_null = df[col_name].isnull().sum()
    if isContinuous:
        
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
    plt.xticks(rotation = 90)


    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index() 
        data.columns = [i, target, 'proportion']
        #sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)
    
    plt.tight_layout()

def dame_variables_categoricas(dataset=None):
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []

    for i in dataset.columns:

        if dataset[i].dtype == object:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)
        
        if dataset[i].dtype == int:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 10:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)

    return lista_variables_categoricas, other

def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    # Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'

    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    # Compute the correlation matrix
    corr = dataset.corr(method=metodo) 
    # Set self-correlation to zero to avoid distraction
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=size_figure)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()
    
    return 0

def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    pd_final = pd.DataFrame()
    
    for i in list_var_continuous:
        
        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size
        
        perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
        perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
        
        if perc_excess>0:    
            pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final

def get_percent_null_values_target(pd_loan, list_var_continuous, target):

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if pd_loan[i].isnull().sum()>0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
            pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final

def cramers_v(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

Con estas funciones estamos tratando de hacer varias cosas:

  1. plot_feature: va a generar gráficos específicos para cada variable donde hará un histograma para analizar la distribución y boxplot para ver cómo se distribuyen los valores en base a la variable TARGET para las variables contínuas. Para las variables categóricas va a crear dos gráficos de barras, para la frecuencia de la variable y otro con las proporciones por categoría de TARGET

  2. dame_variables_categoricas: trata de identificar variables categóricas y separarlas en dos: aquellas variables que tienen menos de 100 valores únicos (estas pueden ser muy útiles en nuestro análisis), y aquellas que tienen más de 100 registros únicos

  3. get_corr_matrix: para visualizar la matriz de correlación para variables numéricas

  4. get_deviation_of_mean_perc: crea un rango de valores alrededor de la media con la desviación típica y encuentra la proporción de valores dentro y fuera de este intervalo, para así poder ver los valores atípicos y la relación que tienen con la variable TARGET.

  5. get_percent_values_target: es para analizar la relación que hay entre valores nulos y la variable TARGET.

  6. cramers_v: para ver la relación entre variables categóricas usando la estadística de Cramer

LECTURA DE DATOS DEL PREPROCESADO INICIAL

In [6]:
path_folder = "../data/"
pd_loan = pd.read_csv(path_folder + "application_data.csv", low_memory = False).set_index('SK_ID_CURR')

pd_loan.head()
Out[6]:
TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR
100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 351000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.018801 -9461 -637 -3648.0 -2120 NaN 1 1 0 1 1 0 Laborers 1.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Business Entity Type 3 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.022 0.0198 0.0 0.0 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.00 reg oper account block of flats 0.0149 Stone, brick No 2.0 2.0 2.0 2.0 -1134.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 1129500.0 Family State servant Higher education Married House / apartment 0.003541 -16765 -1188 -1186.0 -291 NaN 1 1 0 1 1 0 Core staff 2.0 1 1 MONDAY 11 0 0 0 0 0 0 School 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.079 0.0554 0.0 0.0 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.01 reg oper account block of flats 0.0714 Block No 1.0 0.0 1.0 0.0 -828.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 135000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.010032 -19046 -225 -4260.0 -2531 26.0 1 1 1 1 1 0 Laborers 1.0 2 2 MONDAY 9 0 0 0 0 0 0 Government NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -815.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 297000.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.008019 -19005 -3039 -9833.0 -2437 NaN 1 1 0 1 0 0 Laborers 2.0 2 2 WEDNESDAY 17 0 0 0 0 0 0 Business Entity Type 3 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 2.0 0.0 -617.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 513000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.028663 -19932 -3038 -4311.0 -3458 NaN 1 1 0 1 0 0 Core staff 1.0 2 2 THURSDAY 11 0 0 0 0 1 1 Religion NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -1106.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

Utilizamos el ID de cliente como el índice del Dataframe. A continuación, identificamos las variables continuas y categóricas dentro del conjunto de datos

In [4]:
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función recibe un DataFrame una lista de las variables categóricas 
        (con pocos valores únicos).
        
    - Inputs: 
        - dataset (DataFrame): DataFrame que contiene los datos de entrada.
        
    - Return:
        - lista_variables_categoricas (list): Lista con los nombres de las variables 
          categóricas en el DataFrame.
        - other (list): Lista con los nombres de las variables que no cumplen los criterios 
          para ser categóricas.
        - 1: Indica que la ejecución es incorrecta debido a la falta del 
          argumento 'dataset'.
    '''
    # Verificar que el DataFrame de entrada no sea nulo
    if dataset is None:
        print(u'\nError: Falta el argumento dataset en la función')
        return 1 
    
    lista_variables_categoricas = []  
    other = []  

    # Recorrer las columnas del DataFrame
    for i in dataset.columns:
        
        # Si la columna es de tipo objeto (posiblemente categórica)
        if dataset[i].dtype == object:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)  
            else:
                other.append(i)  
                
        # Si la columna es de tipo entero                
        if dataset[i].dtype == int:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 10:
                lista_variables_categoricas.append(i)  
            else:
                other.append(i) 

    return lista_variables_categoricas, other
In [7]:
import numpy as np
#Aquí identificamos las variables categóricas
list_var_cat, other = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")

#Seleccionamos las columnas que tienen datos numéricos continuos
list_var_continuous = list(pd_loan.select_dtypes('float').columns)
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].astype(float)
pd_loan.dtypes

pd_loan.dtypes.to_dict()
Out[7]:
{'TARGET': dtype('int64'),
 'NAME_CONTRACT_TYPE': CategoricalDtype(categories=['Cash loans', 'Revolving loans'], ordered=False, categories_dtype=object),
 'CODE_GENDER': CategoricalDtype(categories=['F', 'M', 'XNA'], ordered=False, categories_dtype=object),
 'FLAG_OWN_CAR': CategoricalDtype(categories=['N', 'Y'], ordered=False, categories_dtype=object),
 'FLAG_OWN_REALTY': CategoricalDtype(categories=['N', 'Y'], ordered=False, categories_dtype=object),
 'CNT_CHILDREN': dtype('int64'),
 'AMT_INCOME_TOTAL': dtype('float64'),
 'AMT_CREDIT': dtype('float64'),
 'AMT_ANNUITY': dtype('float64'),
 'AMT_GOODS_PRICE': dtype('float64'),
 'NAME_TYPE_SUITE': CategoricalDtype(categories=['Children', 'Family', 'Group of people', 'Other_A',
                   'Other_B', 'Spouse, partner', 'Unaccompanied'],
 , ordered=False, categories_dtype=object),
 'NAME_INCOME_TYPE': CategoricalDtype(categories=['Businessman', 'Commercial associate', 'Maternity leave',
                   'Pensioner', 'State servant', 'Student', 'Unemployed',
                   'Working'],
 , ordered=False, categories_dtype=object),
 'NAME_EDUCATION_TYPE': CategoricalDtype(categories=['Academic degree', 'Higher education', 'Incomplete higher',
                   'Lower secondary', 'Secondary / secondary special'],
 , ordered=False, categories_dtype=object),
 'NAME_FAMILY_STATUS': CategoricalDtype(categories=['Civil marriage', 'Married', 'Separated',
                   'Single / not married', 'Unknown', 'Widow'],
 , ordered=False, categories_dtype=object),
 'NAME_HOUSING_TYPE': CategoricalDtype(categories=['Co-op apartment', 'House / apartment',
                   'Municipal apartment', 'Office apartment',
                   'Rented apartment', 'With parents'],
 , ordered=False, categories_dtype=object),
 'REGION_POPULATION_RELATIVE': dtype('float64'),
 'DAYS_BIRTH': dtype('int64'),
 'DAYS_EMPLOYED': dtype('int64'),
 'DAYS_REGISTRATION': dtype('float64'),
 'DAYS_ID_PUBLISH': dtype('int64'),
 'OWN_CAR_AGE': dtype('float64'),
 'FLAG_MOBIL': dtype('int64'),
 'FLAG_EMP_PHONE': dtype('int64'),
 'FLAG_WORK_PHONE': dtype('int64'),
 'FLAG_CONT_MOBILE': dtype('int64'),
 'FLAG_PHONE': dtype('int64'),
 'FLAG_EMAIL': dtype('int64'),
 'OCCUPATION_TYPE': CategoricalDtype(categories=['Accountants', 'Cleaning staff', 'Cooking staff',
                   'Core staff', 'Drivers', 'HR staff', 'High skill tech staff',
                   'IT staff', 'Laborers', 'Low-skill Laborers', 'Managers',
                   'Medicine staff', 'Private service staff', 'Realty agents',
                   'Sales staff', 'Secretaries', 'Security staff',
                   'Waiters/barmen staff'],
 , ordered=False, categories_dtype=object),
 'CNT_FAM_MEMBERS': dtype('float64'),
 'REGION_RATING_CLIENT': dtype('int64'),
 'REGION_RATING_CLIENT_W_CITY': dtype('int64'),
 'WEEKDAY_APPR_PROCESS_START': CategoricalDtype(categories=['FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY',
                   'TUESDAY', 'WEDNESDAY'],
 , ordered=False, categories_dtype=object),
 'HOUR_APPR_PROCESS_START': dtype('int64'),
 'REG_REGION_NOT_LIVE_REGION': dtype('int64'),
 'REG_REGION_NOT_WORK_REGION': dtype('int64'),
 'LIVE_REGION_NOT_WORK_REGION': dtype('int64'),
 'REG_CITY_NOT_LIVE_CITY': dtype('int64'),
 'REG_CITY_NOT_WORK_CITY': dtype('int64'),
 'LIVE_CITY_NOT_WORK_CITY': dtype('int64'),
 'ORGANIZATION_TYPE': CategoricalDtype(categories=['Advertising', 'Agriculture', 'Bank',
                   'Business Entity Type 1', 'Business Entity Type 2',
                   'Business Entity Type 3', 'Cleaning', 'Construction',
                   'Culture', 'Electricity', 'Emergency', 'Government', 'Hotel',
                   'Housing', 'Industry: type 1', 'Industry: type 10',
                   'Industry: type 11', 'Industry: type 12',
                   'Industry: type 13', 'Industry: type 2', 'Industry: type 3',
                   'Industry: type 4', 'Industry: type 5', 'Industry: type 6',
                   'Industry: type 7', 'Industry: type 8', 'Industry: type 9',
                   'Insurance', 'Kindergarten', 'Legal Services', 'Medicine',
                   'Military', 'Mobile', 'Other', 'Police', 'Postal', 'Realtor',
                   'Religion', 'Restaurant', 'School', 'Security',
                   'Security Ministries', 'Self-employed', 'Services',
                   'Telecom', 'Trade: type 1', 'Trade: type 2', 'Trade: type 3',
                   'Trade: type 4', 'Trade: type 5', 'Trade: type 6',
                   'Trade: type 7', 'Transport: type 1', 'Transport: type 2',
                   'Transport: type 3', 'Transport: type 4', 'University',
                   'XNA'],
 , ordered=False, categories_dtype=object),
 'EXT_SOURCE_1': dtype('float64'),
 'EXT_SOURCE_2': dtype('float64'),
 'EXT_SOURCE_3': dtype('float64'),
 'APARTMENTS_AVG': dtype('float64'),
 'BASEMENTAREA_AVG': dtype('float64'),
 'YEARS_BEGINEXPLUATATION_AVG': dtype('float64'),
 'YEARS_BUILD_AVG': dtype('float64'),
 'COMMONAREA_AVG': dtype('float64'),
 'ELEVATORS_AVG': dtype('float64'),
 'ENTRANCES_AVG': dtype('float64'),
 'FLOORSMAX_AVG': dtype('float64'),
 'FLOORSMIN_AVG': dtype('float64'),
 'LANDAREA_AVG': dtype('float64'),
 'LIVINGAPARTMENTS_AVG': dtype('float64'),
 'LIVINGAREA_AVG': dtype('float64'),
 'NONLIVINGAPARTMENTS_AVG': dtype('float64'),
 'NONLIVINGAREA_AVG': dtype('float64'),
 'APARTMENTS_MODE': dtype('float64'),
 'BASEMENTAREA_MODE': dtype('float64'),
 'YEARS_BEGINEXPLUATATION_MODE': dtype('float64'),
 'YEARS_BUILD_MODE': dtype('float64'),
 'COMMONAREA_MODE': dtype('float64'),
 'ELEVATORS_MODE': dtype('float64'),
 'ENTRANCES_MODE': dtype('float64'),
 'FLOORSMAX_MODE': dtype('float64'),
 'FLOORSMIN_MODE': dtype('float64'),
 'LANDAREA_MODE': dtype('float64'),
 'LIVINGAPARTMENTS_MODE': dtype('float64'),
 'LIVINGAREA_MODE': dtype('float64'),
 'NONLIVINGAPARTMENTS_MODE': dtype('float64'),
 'NONLIVINGAREA_MODE': dtype('float64'),
 'APARTMENTS_MEDI': dtype('float64'),
 'BASEMENTAREA_MEDI': dtype('float64'),
 'YEARS_BEGINEXPLUATATION_MEDI': dtype('float64'),
 'YEARS_BUILD_MEDI': dtype('float64'),
 'COMMONAREA_MEDI': dtype('float64'),
 'ELEVATORS_MEDI': dtype('float64'),
 'ENTRANCES_MEDI': dtype('float64'),
 'FLOORSMAX_MEDI': dtype('float64'),
 'FLOORSMIN_MEDI': dtype('float64'),
 'LANDAREA_MEDI': dtype('float64'),
 'LIVINGAPARTMENTS_MEDI': dtype('float64'),
 'LIVINGAREA_MEDI': dtype('float64'),
 'NONLIVINGAPARTMENTS_MEDI': dtype('float64'),
 'NONLIVINGAREA_MEDI': dtype('float64'),
 'FONDKAPREMONT_MODE': CategoricalDtype(categories=['not specified', 'org spec account', 'reg oper account',
                   'reg oper spec account'],
 , ordered=False, categories_dtype=object),
 'HOUSETYPE_MODE': CategoricalDtype(categories=['block of flats', 'specific housing', 'terraced house'], ordered=False, categories_dtype=object),
 'TOTALAREA_MODE': dtype('float64'),
 'WALLSMATERIAL_MODE': CategoricalDtype(categories=['Block', 'Mixed', 'Monolithic', 'Others', 'Panel',
                   'Stone, brick', 'Wooden'],
 , ordered=False, categories_dtype=object),
 'EMERGENCYSTATE_MODE': CategoricalDtype(categories=['No', 'Yes'], ordered=False, categories_dtype=object),
 'OBS_30_CNT_SOCIAL_CIRCLE': dtype('float64'),
 'DEF_30_CNT_SOCIAL_CIRCLE': dtype('float64'),
 'OBS_60_CNT_SOCIAL_CIRCLE': dtype('float64'),
 'DEF_60_CNT_SOCIAL_CIRCLE': dtype('float64'),
 'DAYS_LAST_PHONE_CHANGE': dtype('float64'),
 'FLAG_DOCUMENT_2': dtype('int64'),
 'FLAG_DOCUMENT_3': dtype('int64'),
 'FLAG_DOCUMENT_4': dtype('int64'),
 'FLAG_DOCUMENT_5': dtype('int64'),
 'FLAG_DOCUMENT_6': dtype('int64'),
 'FLAG_DOCUMENT_7': dtype('int64'),
 'FLAG_DOCUMENT_8': dtype('int64'),
 'FLAG_DOCUMENT_9': dtype('int64'),
 'FLAG_DOCUMENT_10': dtype('int64'),
 'FLAG_DOCUMENT_11': dtype('int64'),
 'FLAG_DOCUMENT_12': dtype('int64'),
 'FLAG_DOCUMENT_13': dtype('int64'),
 'FLAG_DOCUMENT_14': dtype('int64'),
 'FLAG_DOCUMENT_15': dtype('int64'),
 'FLAG_DOCUMENT_16': dtype('int64'),
 'FLAG_DOCUMENT_17': dtype('int64'),
 'FLAG_DOCUMENT_18': dtype('int64'),
 'FLAG_DOCUMENT_19': dtype('int64'),
 'FLAG_DOCUMENT_20': dtype('int64'),
 'FLAG_DOCUMENT_21': dtype('int64'),
 'AMT_REQ_CREDIT_BUREAU_HOUR': dtype('float64'),
 'AMT_REQ_CREDIT_BUREAU_DAY': dtype('float64'),
 'AMT_REQ_CREDIT_BUREAU_WEEK': dtype('float64'),
 'AMT_REQ_CREDIT_BUREAU_MON': dtype('float64'),
 'AMT_REQ_CREDIT_BUREAU_QRT': dtype('float64'),
 'AMT_REQ_CREDIT_BUREAU_YEAR': dtype('float64')}

Algunas variables clasificadas inicialmente como enteras son de carácter booleano, lo que las hace categóricas.

SEPARACIÓN EN TRAIN Y TEST ESTRATIFICADO

In [12]:
pd_plot_target = pd_loan['TARGET'].value_counts(normalize=False).reset_index()

pd_plot_target.columns = ['TARGET', 'count']

fig = px.bar(pd_plot_target, x='TARGET', y='count')
fig.update_xaxes(tickvals=[0, 1])
fig.show()

Se observa que el muestreo está desbalanceado, ya que la mayoría de los clientes no tienen dificultades para pagar el préstamo. (0 = pagado y 1 = no pagado) El 20% de los datos se asignan al conjunto de prueba y el 80% al conjunto de entrenamiento para entrenar el modelo.

In [13]:
from sklearn.model_selection import train_test_split
X_pd_loan, X_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('TARGET',axis=1), 
                                                                     pd_loan['TARGET'], 
                                                                     stratify=pd_loan['TARGET'], 
                                                                     test_size=0.2)
pd_loan_train = pd.concat([X_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([X_pd_loan_test, y_pd_loan_test],axis=1)
In [14]:
print('== Train\n', pd_loan_train['TARGET'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['TARGET'].value_counts(normalize=True))
== Train
 TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64
== Test
 TARGET
0    0.919272
1    0.080728
Name: proportion, dtype: float64

Como observamos en el resultado, cada categoría de la variable target tiene la misma proporción tanto en el train como en el test, lo que permite que el modelo tenga ejemplos de todas las clases en ambos conjuntos y asi evita que el modelo tenga un sesgo hacia una clase en particular.

VISUALIZACIÓN DESCRIPTIVA DE LOS DATOS

Aquí vamos a realizar la suma de la cantidad de valores nulos que hay por filas y por columnas.

pd_null_columnas contiene el número de valores nulos por columna y el % de valores nulos respecto al total de filas. pd_null_filas contiene el número de valores nulos por filas y el porcentaje de valores nulos respecto al total de columnas, y la relación con la variable TARGET

In [15]:
#Veo el número de valores nulos por filas y columnas
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)

pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])     
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])  
pd_null_filas['TARGET'] = pd_loan['TARGET'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_loan_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/pd_loan_train.shape[1]
(121,) (246008,)
In [16]:
pd_null_columnas.head()
Out[16]:
nulos_columnas porcentaje_columnas
COMMONAREA_MODE 171913 0.698811
COMMONAREA_MEDI 171913 0.698811
COMMONAREA_AVG 171913 0.698811
NONLIVINGAPARTMENTS_MEDI 170833 0.694421
NONLIVINGAPARTMENTS_MODE 170833 0.694421
In [17]:
pd_null_filas.head()
Out[17]:
nulos_filas TARGET porcentaje_filas
SK_ID_CURR
274127 61 0 0.504132
255145 61 0 0.504132
412312 61 0 0.504132
329262 61 0 0.504132
180861 61 0 0.504132

Hemos hecho un conteo y porcentaje de valores nulos por filas y columnas para identificar qué columnas y filas tienen más valores nulos por si hubiese que tomar la decisión de imputarlas o eliminarlas

In [18]:
list_var_cat
Out[18]:
['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']
In [19]:
#Realizamos un bucle para las variables categóricas

for variable in list_var_cat:
    if variable != "TARGET":
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        # Gráfico de conteo
        sns.countplot(data=pd_loan_train, x=variable, ax=axes[0])
        axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
        axes[0].set_ylabel('Count')

        # Rotar etiquetas del eje X
        axes[0].tick_params(axis='x', rotation=45)

        # Para cada categoría: gráfico de fracciones
        pd_loan_df = pd_loan_train.groupby([variable, 'TARGET']).size().reset_index(name='count')
        pd_loan_df['fraction'] = pd_loan_df['count'] / pd_loan_df.groupby(variable)['count'].transform('sum')

        sns.barplot(data=pd_loan_df, x=variable, y='fraction', hue='TARGET', ax=axes[1])
        axes[1].set_title(f'{variable} vs TARGET')
        axes[1].set_ylabel('TARGET fraction')

        # Rotar etiquetas del eje X
        axes[1].tick_params(axis='x', rotation=45)

        # Ajustar diseño
        plt.tight_layout()
        plt.show()
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image

A partir de estos gráficos, podemos ver a la izquierda el número de valores nulos para cada categoría de las variables, y en el de la derecha, cómo se distribuye una variable en función de la variable TARGET. Por lo general, vemos que los datos están bastante desbalanceados, pues hay muchas variables que no vemos que tengan un impacto fuerte en el impago del préstamo. A continuación, vamos a comentar algunas variables que puedan ser significativas en el impago del préstamo.

NAME_INCOME_TYPE (Situación laboral del solicitante) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con 'Working', es decir, personas que tienen un empleo. Otras categorías presentan también una parte de los datos, como 'Pensioner' o 'Commercial Associate'. Otras categorías como 'Student' o 'Unemployed' presentan muy poca distribución de los datos, o ninguna. En el gráfico de la derecha, que muestra la fracción de incumplimiento para cada categoría, vemos que las personas desempleadas o las que están de baja por maternidad, tienen un porcentaje alto de incumplimiento. Los desempleados parece lógico, pues se asocia con la falta de un trabajo estable. Otras categorías como los pensionistas o los trabajadores presentan un grado bajo de incumplimiento, pues están asociados con un salario o pensión.

NAME_EDUCATION_TYPE(Nivel de educación) En el gráfico de la izquierda vemos que los datos se distribuyen sobre todo en clientes que tienen 'Secondary/Secondary Special', seguidos de 'Higher Education'. 'Academid degree' o 'Lower Secundary' son menos comunes.En el gráfico de la derecha se muestra las fracciones de incumplimiento para cada nivel educativo. Como vemos, secundaria, secundaria especial o secundaria baja son los niveles que más probabilidad de incumplimiento tienen, algo que puede estar relacionado por el tipo de empleo que estos clientes tienen, asociado con la baja formación académica recibida. Los clientes con un grado o educación superior presentan menor riesgo de incumplimiento, algo que también puede estar relacionado por el caso contrario al anterior.Podemos concluir que el nivel educativo influye en la capacidad de incumplimiento del solicitante.

NAME_HOUSING_TYPE(Tipo de vivienda) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con clientes que vivene un apartameto o casa, que por otra parte es lo más normal. Otras categorías presentan una distribución de los datos mucho menor, como 'Municipal Apartment', 'With Parents' o 'Rented Apartment'. En cuanto a su relación con TARGET, vemos que la fracción de incumplimineto es algo mayor para aquellos solicitantes que viven con los padres o de alquiler. Esto puede estar relacionado con un menor grado de cumplimineto, pues los que viven con sus padres es posible que no tengan una ocupación, aunque también pueden ser aún jóvenes para independizarse. Aun así, no parece una variable muy significativa en cuanto al grado de incumplimiento del préstamo.

OCCUPATYON_TYPE (Tipo de trabajo) Al observar la variable, vemos que presenta una buena cantidad de nulos que podríamos imputar para mejorar el análisis. En el gráfico de la izquierda, vemos que la mayoría de los solicitantes pertenecen a 'Laborers'(en mayor proporción), 'Core Staff' y 'Sales Staff'. Como vemos en el gráfico de la derecha, los trabajadores dedicados a actividades manuales o físicas como la construcción, fabricación, agaricultura, así como aquellos con una baja especialización, como por ejemplo, ayudante de obra, presentan mayor incumplimineto de devolver el préstamo. Otros gremios, como los conductores o los vigilantes de seguridad presentan un grado de incumplimiento menor, aunque mayor que el resto. Esto puede estar debido a la formación educativa, ya que estos trabajos no requieren de una cualificación para dedicarse a ellos. Podemos concluir que el tipo de ocupación puede influir en el grado de incumplimiento del préstamo.

CODE_GENDER(Género) Como vemos en el gráfico de la izquierda, hay mayor proporción de clientes femeninos que masculinos.En el gráfico de la derecha no parece haber una diferencia extremadamente significativa entre el grado de incumplimineto de hombres y mujeres, aunque si nos fijamos el grado de incumplimiento mayor es ligeramente masculino. Esto puede deberse a muchas razones como el tipo de trabajo, mentalidad o tipo de educación, entre otras. En conclusión, esta variable tiene una relación moderada con la probabilidad de incumplkimiento, siendo importante considerar cómoa actúa con otras variables.

NAME_CONTRACT_TYPE (Tipo de contrato) En el gráfico de la izquierda vemos que la mayoría de los préstamos pertenecen a 'Cash Loans', mientras que los 'Revolving Loans' tienen una proproción mucho menor. Con respecto a la variable TARGET, vemos que no hay una gran diferencia en la probabilidad de incumplimiento entre ambos préstamos, pero sí que es verdad que los 'Cash Loans' tienen una probabilidad mayor, cosa que tiene sentido al ser mucho más comunes en el día a día.

In [20]:
for variable in list_var_continuous:  # Lista de variables continuas
    if variable != "TARGET":
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        # Histograma
        sns.histplot(data=pd_loan_train, x=variable, kde=True, ax=axes[0], bins=30)
        axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
        axes[0].set_ylabel('Frequency')

        # Boxplot vs TARGET
        sns.boxplot(data=pd_loan_train, x='TARGET', y=variable, ax=axes[1])
        axes[1].set_title(f'{variable} vs TARGET')
        axes[1].set_ylabel(variable)
        axes[1].set_xlabel('TARGET')

        # Ajustar diseño
        plt.tight_layout()
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Una vez realizados los gráficos de las variables continuas con la variable objetivo (TARGET), procederemos a realizar algunos comentarios de algunas variables que puedan influir en la misma, así como algunas que no sean tan significativas para tener un ejemplo de todo. A continuación, veremos cuáles son las variables elegidas.

OWN_CAR_AGE (Edad del coche propio). Como vemos en el gráfico de la izquierda, hay un número considerable de valores nulos, concretamente 162.426, lo que indica que muchas personas no poseen automóvil o no hay información a cerca de ellos. Esto se debería gestionar mediante, por ejemplo, con la imputación o creando una categoría específica para los que no tienen automóvil. La mayoría de los coches tienen entre 0 y 20 años, con una concentración importante sobre todo de 0 a 10. No hay casi valores más allá de los 40 años, solo unos pocos datos sobrepasan los 60 años. En cuanto a su relación con TARGET, vemos en los boxplots que no hay grandes diferencias en la mediana ni en el rango intercuartílico para 0 y para 1, lo que nos puede indicar que no el coche propio puede no ser un factor influyente en cuanto a devolver o no el crédito.

EXT_SOURCE_1 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas). En el gráfico de la izquierda se puede observar que la variable sigue una distribución simétrica con forma similar a una distribución normal. Esta variable tiene una elevada cantidad de valores lo que indica que puede ser útil para discriminar entre distintos tipos de solicitantes. En cuanto a su relación con TARGET, observamos que el boxplot muestra que la mediana para los clientes que no incumplen (TARGET=0) es algo mayor que la de los que sí incumplen (TARGET=1) tienden a tener valores más altos, lo que puede decir que esta variable está asociada con un menor riesgo de incumplimiento.El rango intercuartílico es algo mayor también en los clientes cumplidores que en los que no. Podemos decir que esta variable puede tener un cierto poder predictivo.Los valores altos de EXT_SOURCE_1 se asocian con un menor riesgo de incumplimiento.

CNT_FAM_MEMBERS (Numero de miembros de la familia). En el gráfico de la izquierda vemos que la variable posee 2 valores nulos, algo que no es significativo dentro del total de datos. En el histograma se muestra que la mayoría de los datos se encuentran entre 2 y 4 miembros, concentrándose sobre todo en 2, sugiriendo que una gran parte de las familias son pequeñas. En relación con TARGET, el boxplot muestra que la distribución es similiar tanto para los cumplidores como para los que no. Tanto la mediana como el rango intercuartílico son iguales, lo que indica que no hay una gran relación con la probabilidad de impago. Hay algunas familias con un número elevado de miembros, aunque no parece que tengan un efecto significativo.

AMT_INCOME_TOTAL (Ingresos totales). Esta variable no posee valores nulos, por lo que podría ser representativa. En cuanto al histograma, la distribución de la variable está sesgada hacia la izquierda, lo que indica que la mayoría de clientes tienen ingresos bajos. La distribuación de los ingresos para TARGET=0 y TARGET=1 es muy similar, La mediana y el rango intercuartílico son también casi iguales, lo que indica que el nivel de ingresos no tiene una relación directa con la posibilidad de impago.

EXT_SOURCE_2 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) En este caso, presenta pocos valores nulos con respecto al total de los datos, aunque se podrían imputar para no perder demasiada información. En el histograma vemos que la mayoría de los valores están concentrados entre 0.4 y 0.8, es decir, sesgados a la derecha, mientras que los valores extremos son menos comunes.En cuanto a su relación con TARGET, vemos que los valores más altos están relacionados con los clientes que no incumplen. La mediana es mas alta para los clientes que no incumplen, lo que un valor más alto de la variable podría estar asociado con un menor riesgo de incumplimiento. El rango intercuartílico es bastante parecido, aunque se observa que los valores bajos se concentran más en los clientes que presentan incumplimiento, por lo que podemos decir que esta variable puede ser un buen predictor.

EXT_SOURCE_3 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) Al igual que en las variables anteriores como EXT_SOURCE_1 y EXT_SOURCE_2, prsenta los valores más altos asocidados a clientes que no incumplen el préstamo, lo que indica que valores bajos poseen un mayor riesgo de incumplimiento. Por lo tanto, podemos concluir que la información externa sobre clientes solicitantes de préstamo es una buena variable predictora.

TRATAMIENTO DE VARIABLES CONTINUAS

A continuación se tratarán tres aspectos clave del análisis de los datos: los valores missing, los outliers y las correlaciones entre las variables continuas

In [21]:
list_var_continuous
Out[21]:
['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_REGISTRATION',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']
In [22]:
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función get_deviation_of_mean_perc:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función calcula el porcentaje de valores que se encuentran fuera de un 
        intervalo de confianza, determinado por la media y una desviación estándar 
        multiplicada por un factor (multiplier), para cada variable continua en el 
        DataFrame. Luego, analiza la relación entre estos valores atípicos y la variable 
        objetivo (TARGET), y devuelve un resumen con los porcentajes de valores atípicos 
        y su distribución en relación con la variable objetivo.
        
    - Inputs: 
        - pd_loan (DataFrame): DataFrame que contiene los datos de entrada.
        - list_var_continuous (list): Lista con los nombres de las variables continuas 
          a analizar.
        - target (str): Nombre de la variable objetivo en el DataFrame.
        - multiplier (float): Factor multiplicador para calcular el intervalo de confianza
          (desviación estándar).
        
    - Return:
        - pd_final (DataFrame): DataFrame que contiene el porcentaje de valores atípicos 
          por cada variable continua, su distribución con respecto a la variable objetivo 
          (TARGET), y otros detalles relevantes.
    '''
    
    pd_final = pd.DataFrame()
    
    for i in list_var_continuous:
        
        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size
        
        perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
        perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
        
        if perc_excess>0:    
            pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]

            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final
In [23]:
get_deviation_of_mean_perc(pd_loan_train, list_var_continuous, target='TARGET', multiplier=3)
Out[23]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.000000 1.000000 AMT_INCOME_TOTAL 2187 0.008890
1 0.946502 0.053498 AMT_INCOME_TOTAL 2187 0.008890
2 0.000000 1.000000 AMT_CREDIT 2596 0.010553
3 0.957242 0.042758 AMT_CREDIT 2596 0.010553
4 0.000000 1.000000 AMT_ANNUITY 2356 0.009577
5 0.963497 0.036503 AMT_ANNUITY 2356 0.009577
6 0.000000 1.000000 AMT_GOODS_PRICE 3298 0.013406
7 0.960279 0.039721 AMT_GOODS_PRICE 3298 0.013406
8 0.000000 1.000000 REGION_POPULATION_RELATIVE 6698 0.027227
9 0.961033 0.038967 REGION_POPULATION_RELATIVE 6698 0.027227
10 0.000000 1.000000 DAYS_REGISTRATION 608 0.002471
11 0.960526 0.039474 DAYS_REGISTRATION 608 0.002471
12 0.000000 1.000000 OWN_CAR_AGE 2664 0.010829
13 0.920420 0.079580 OWN_CAR_AGE 2664 0.010829
14 0.000000 1.000000 CNT_FAM_MEMBERS 3205 0.013028
15 0.898284 0.101716 CNT_FAM_MEMBERS 3205 0.013028
16 0.000000 1.000000 APARTMENTS_AVG 2345 0.009532
17 0.947548 0.052452 APARTMENTS_AVG 2345 0.009532
18 0.000000 1.000000 BASEMENTAREA_AVG 1587 0.006451
19 0.943289 0.056711 BASEMENTAREA_AVG 1587 0.006451
20 0.000000 1.000000 YEARS_BEGINEXPLUATATION_AVG 553 0.002248
21 0.905967 0.094033 YEARS_BEGINEXPLUATATION_AVG 553 0.002248
22 0.000000 1.000000 YEARS_BUILD_AVG 947 0.003849
23 0.927138 0.072862 YEARS_BUILD_AVG 947 0.003849
24 0.000000 1.000000 COMMONAREA_AVG 1364 0.005545
25 0.949413 0.050587 COMMONAREA_AVG 1364 0.005545
26 0.000000 1.000000 ELEVATORS_AVG 1899 0.007719
27 0.953133 0.046867 ELEVATORS_AVG 1899 0.007719
28 0.000000 1.000000 ENTRANCES_AVG 1746 0.007097
29 0.935281 0.064719 ENTRANCES_AVG 1746 0.007097
30 0.000000 1.000000 FLOORSMAX_AVG 2051 0.008337
31 0.958557 0.041443 FLOORSMAX_AVG 2051 0.008337
32 0.000000 1.000000 FLOORSMIN_AVG 474 0.001927
33 0.970464 0.029536 FLOORSMIN_AVG 474 0.001927
34 0.000000 1.000000 LANDAREA_AVG 1672 0.006797
35 0.935407 0.064593 LANDAREA_AVG 1672 0.006797
36 0.000000 1.000000 LIVINGAPARTMENTS_AVG 1376 0.005593
37 0.946948 0.053052 LIVINGAPARTMENTS_AVG 1376 0.005593
38 0.000000 1.000000 LIVINGAREA_AVG 2534 0.010300
39 0.946725 0.053275 LIVINGAREA_AVG 2534 0.010300
40 0.000000 1.000000 NONLIVINGAPARTMENTS_AVG 577 0.002345
41 0.925477 0.074523 NONLIVINGAPARTMENTS_AVG 577 0.002345
42 0.000000 1.000000 NONLIVINGAREA_AVG 1907 0.007752
43 0.947562 0.052438 NONLIVINGAREA_AVG 1907 0.007752
44 0.000000 1.000000 APARTMENTS_MODE 2346 0.009536
45 0.945865 0.054135 APARTMENTS_MODE 2346 0.009536
46 0.000000 1.000000 BASEMENTAREA_MODE 1648 0.006699
47 0.941141 0.058859 BASEMENTAREA_MODE 1648 0.006699
48 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MODE 542 0.002203
49 0.904059 0.095941 YEARS_BEGINEXPLUATATION_MODE 542 0.002203
50 0.000000 1.000000 YEARS_BUILD_MODE 955 0.003882
51 0.924607 0.075393 YEARS_BUILD_MODE 955 0.003882
52 0.000000 1.000000 COMMONAREA_MODE 1357 0.005516
53 0.944731 0.055269 COMMONAREA_MODE 1357 0.005516
54 0.000000 1.000000 ELEVATORS_MODE 2620 0.010650
55 0.949618 0.050382 ELEVATORS_MODE 2620 0.010650
56 0.000000 1.000000 ENTRANCES_MODE 2084 0.008471
57 0.939060 0.060940 ENTRANCES_MODE 2084 0.008471
58 0.000000 1.000000 FLOORSMAX_MODE 2072 0.008422
59 0.960907 0.039093 FLOORSMAX_MODE 2072 0.008422
60 0.000000 1.000000 FLOORSMIN_MODE 386 0.001569
61 0.968912 0.031088 FLOORSMIN_MODE 386 0.001569
62 0.000000 1.000000 LANDAREA_MODE 1694 0.006886
63 0.935065 0.064935 LANDAREA_MODE 1694 0.006886
64 0.000000 1.000000 LIVINGAPARTMENTS_MODE 1423 0.005784
65 0.944483 0.055517 LIVINGAPARTMENTS_MODE 1423 0.005784
66 0.000000 1.000000 LIVINGAREA_MODE 2658 0.010805
67 0.945824 0.054176 LIVINGAREA_MODE 2658 0.010805
68 0.000000 1.000000 NONLIVINGAPARTMENTS_MODE 544 0.002211
69 0.917279 0.082721 NONLIVINGAPARTMENTS_MODE 544 0.002211
70 0.000000 1.000000 NONLIVINGAREA_MODE 1946 0.007910
71 0.948613 0.051387 NONLIVINGAREA_MODE 1946 0.007910
72 0.000000 1.000000 APARTMENTS_MEDI 2373 0.009646
73 0.947324 0.052676 APARTMENTS_MEDI 2373 0.009646
74 0.000000 1.000000 BASEMENTAREA_MEDI 1592 0.006471
75 0.942839 0.057161 BASEMENTAREA_MEDI 1592 0.006471
76 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MEDI 520 0.002114
77 0.901923 0.098077 YEARS_BEGINEXPLUATATION_MEDI 520 0.002114
78 0.000000 1.000000 YEARS_BUILD_MEDI 958 0.003894
79 0.927975 0.072025 YEARS_BUILD_MEDI 958 0.003894
80 0.000000 1.000000 COMMONAREA_MEDI 1378 0.005601
81 0.948476 0.051524 COMMONAREA_MEDI 1378 0.005601
82 0.000000 1.000000 ELEVATORS_MEDI 2743 0.011150
83 0.950055 0.049945 ELEVATORS_MEDI 2743 0.011150
84 0.000000 1.000000 ENTRANCES_MEDI 1757 0.007142
85 0.934548 0.065452 ENTRANCES_MEDI 1757 0.007142
86 0.000000 1.000000 FLOORSMAX_MEDI 2148 0.008731
87 0.959497 0.040503 FLOORSMAX_MEDI 2148 0.008731
88 0.000000 1.000000 FLOORSMIN_MEDI 441 0.001793
89 0.970522 0.029478 FLOORSMIN_MEDI 441 0.001793
90 0.000000 1.000000 LANDAREA_MEDI 1696 0.006894
91 0.936321 0.063679 LANDAREA_MEDI 1696 0.006894
92 0.000000 1.000000 LIVINGAPARTMENTS_MEDI 1390 0.005650
93 0.945324 0.054676 LIVINGAPARTMENTS_MEDI 1390 0.005650
94 0.000000 1.000000 LIVINGAREA_MEDI 2542 0.010333
95 0.946892 0.053108 LIVINGAREA_MEDI 2542 0.010333
96 0.000000 1.000000 NONLIVINGAPARTMENTS_MEDI 575 0.002337
97 0.926957 0.073043 NONLIVINGAPARTMENTS_MEDI 575 0.002337
98 0.000000 1.000000 NONLIVINGAREA_MEDI 1930 0.007845
99 0.948705 0.051295 NONLIVINGAREA_MEDI 1930 0.007845
100 0.000000 1.000000 TOTALAREA_MODE 2624 0.010666
101 0.953506 0.046494 TOTALAREA_MODE 2624 0.010666
102 0.000000 1.000000 OBS_30_CNT_SOCIAL_CIRCLE 4897 0.019906
103 0.908515 0.091485 OBS_30_CNT_SOCIAL_CIRCLE 4897 0.019906
104 0.000000 1.000000 DEF_30_CNT_SOCIAL_CIRCLE 5444 0.022129
105 0.879133 0.120867 DEF_30_CNT_SOCIAL_CIRCLE 5444 0.022129
106 0.000000 1.000000 OBS_60_CNT_SOCIAL_CIRCLE 4753 0.019321
107 0.908689 0.091311 OBS_60_CNT_SOCIAL_CIRCLE 4753 0.019321
108 0.000000 1.000000 DEF_60_CNT_SOCIAL_CIRCLE 3134 0.012739
109 0.873644 0.126356 DEF_60_CNT_SOCIAL_CIRCLE 3134 0.012739
110 0.000000 1.000000 DAYS_LAST_PHONE_CHANGE 509 0.002069
111 0.950884 0.049116 DAYS_LAST_PHONE_CHANGE 509 0.002069
112 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_HOUR 1312 0.005333
113 0.916159 0.083841 AMT_REQ_CREDIT_BUREAU_HOUR 1312 0.005333
114 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_DAY 1206 0.004902
115 0.904643 0.095357 AMT_REQ_CREDIT_BUREAU_DAY 1206 0.004902
116 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_WEEK 6835 0.027784
117 0.921287 0.078713 AMT_REQ_CREDIT_BUREAU_WEEK 6835 0.027784
118 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_MON 2588 0.010520
119 0.943199 0.056801 AMT_REQ_CREDIT_BUREAU_MON 2588 0.010520
120 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_QRT 1850 0.007520
121 0.909730 0.090270 AMT_REQ_CREDIT_BUREAU_QRT 1850 0.007520
122 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_YEAR 2711 0.011020
123 0.909259 0.090741 AMT_REQ_CREDIT_BUREAU_YEAR 2711 0.011020

CORRELACIONES

In [24]:
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    ''''
    ----------------------------------------------------------------------------------------------------------
    Función get_corr_matrix:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función calcula y visualiza la matriz de correlación entre las variables 
        numéricas de un conjunto de datos. 

    - Inputs: 
        - dataset (DataFrame): Conjunto de datos con las variables numéricas a analizar.
        - metodo (str): Método de correlación a utilizar.
        - size_figure (list): Tamaño de la figura del gráfico.

    - Return:
        - None: Muestra un mapa de calor de la matriz de correlación.
    ----------------------------------------------------------------------------------------------------------
    '''
    
    # Comprobación de que se ha proporcionado el dataset
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    
    # Calcular la matriz de correlación
    corr = dataset.corr(method=metodo) 
    
    # Establecer la autocorrelación a cero para evitar distracciones
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    
    f, ax = plt.subplots(figsize=size_figure)
    
    # Dibujar el mapa de calor con la correlación
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()
    
    return 0
In [25]:
get_corr_matrix(dataset = pd_loan_train[list_var_continuous], 
                metodo='pearson', size_figure=[10,8])
No description has been provided for this image
Out[25]:
0

En la matriz de correlación se observan varias correlaciones de Pearson cercanas o iguales a 1, lo que indica que algunas variables están perfectamente correlacionadas. Esto puede deberse a la existencia de variables redundantes o derivadas de otras, lo que podría generar problemas de multicolinealidad en modelos estadísticos. Es recomendable revisar estas variables y, en caso de ser necesario, eliminar las que sean idénticas o altamente correlacionadas para evitar distorsiones en los resultados del modelo.

In [26]:
corr = pd_loan_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
Out[26]:
level_0 level_1 correlation
3694 OBS_60_CNT_SOCIAL_CIRCLE OBS_30_CNT_SOCIAL_CIRCLE 0.998514
2744 YEARS_BUILD_MEDI YEARS_BUILD_AVG 0.998454
3074 FLOORSMIN_MEDI FLOORSMIN_AVG 0.997279
3008 FLOORSMAX_MEDI FLOORSMAX_AVG 0.997015
2942 ENTRANCES_MEDI ENTRANCES_AVG 0.996893
2876 ELEVATORS_MEDI ELEVATORS_AVG 0.996207
2810 COMMONAREA_MEDI COMMONAREA_AVG 0.995471
3272 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995280
2546 APARTMENTS_MEDI APARTMENTS_AVG 0.995117
2612 BASEMENTAREA_MEDI BASEMENTAREA_AVG 0.994780
3206 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.994544
2678 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_AVG 0.994378
3338 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.991786
3140 LANDAREA_MEDI LANDAREA_AVG 0.991740
3404 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.990674
2758 YEARS_BUILD_MEDI YEARS_BUILD_MODE 0.989445
1834 YEARS_BUILD_MODE YEARS_BUILD_AVG 0.989417
3088 FLOORSMIN_MEDI FLOORSMIN_MODE 0.988434
3022 FLOORSMAX_MEDI FLOORSMAX_MODE 0.988090
196 AMT_GOODS_PRICE AMT_CREDIT 0.986929
2164 FLOORSMIN_MODE FLOORSMIN_AVG 0.985960
2098 FLOORSMAX_MODE FLOORSMAX_AVG 0.985577
2890 ELEVATORS_MEDI ELEVATORS_MODE 0.982941
3154 LANDAREA_MEDI LANDAREA_MODE 0.981883
2956 ENTRANCES_MEDI ENTRANCES_MODE 0.980671
2824 COMMONAREA_MEDI COMMONAREA_MODE 0.979752
1966 ELEVATORS_MODE ELEVATORS_AVG 0.979015
2032 ENTRANCES_MODE ENTRANCES_AVG 0.977723
2626 BASEMENTAREA_MEDI BASEMENTAREA_MODE 0.977227
2560 APARTMENTS_MEDI APARTMENTS_MODE 0.976900
1900 COMMONAREA_MODE COMMONAREA_AVG 0.976325
3352 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.975725
2230 LANDAREA_MODE LANDAREA_AVG 0.975100
3418 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.974836
3220 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.974520
3286 LIVINGAREA_MEDI LIVINGAREA_MODE 0.974122
1702 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.973261
1768 YEARS_BEGINEXPLUATATION_MODE YEARS_BEGINEXPLUATATION_AVG 0.973241
1636 APARTMENTS_MODE APARTMENTS_AVG 0.973040
2362 LIVINGAREA_MODE LIVINGAREA_AVG 0.971317
2296 LIVINGAPARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.969774
2428 NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_AVG 0.967589
2494 NONLIVINGAREA_MODE NONLIVINGAREA_AVG 0.965281
2692 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.964416
1376 LIVINGAPARTMENTS_AVG APARTMENTS_AVG 0.943325
3224 LIVINGAPARTMENTS_MEDI APARTMENTS_MEDI 0.941575
3196 LIVINGAPARTMENTS_MEDI APARTMENTS_AVG 0.940442
2300 LIVINGAPARTMENTS_MODE APARTMENTS_MODE 0.936308
2556 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.935321
2570 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.930829
2286 LIVINGAPARTMENTS_MODE APARTMENTS_AVG 0.929717
3467 TOTALAREA_MODE LIVINGAREA_AVG 0.924992
3495 TOTALAREA_MODE LIVINGAREA_MEDI 0.919302
3289 LIVINGAREA_MEDI APARTMENTS_MEDI 0.915742
1441 LIVINGAREA_AVG APARTMENTS_AVG 0.913186
3210 LIVINGAPARTMENTS_MEDI APARTMENTS_MODE 0.912645
2557 APARTMENTS_MEDI LIVINGAREA_AVG 0.912046
3261 LIVINGAREA_MEDI APARTMENTS_AVG 0.912033
2365 LIVINGAREA_MODE APARTMENTS_MODE 0.910072
1646 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.906915
3481 TOTALAREA_MODE LIVINGAREA_MODE 0.899615
2571 APARTMENTS_MEDI LIVINGAREA_MODE 0.895640
3275 LIVINGAREA_MEDI APARTMENTS_MODE 0.893727
2351 LIVINGAREA_MODE APARTMENTS_AVG 0.893138
3456 TOTALAREA_MODE APARTMENTS_AVG 0.891911
1647 APARTMENTS_MODE LIVINGAREA_AVG 0.890055
3299 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.885788
3484 TOTALAREA_MODE APARTMENTS_MEDI 0.885684
3207 LIVINGAPARTMENTS_MEDI LIVINGAREA_AVG 0.883829
1451 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.882129
3271 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.879969
2375 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.879610
3285 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.875667
2297 LIVINGAPARTMENTS_MODE LIVINGAREA_AVG 0.874129
3294 LIVINGAREA_MEDI ELEVATORS_MEDI 0.867910
1446 LIVINGAREA_AVG ELEVATORS_AVG 0.867106
2882 ELEVATORS_MEDI LIVINGAREA_AVG 0.865079
3266 LIVINGAREA_MEDI ELEVATORS_AVG 0.865038
3470 TOTALAREA_MODE APARTMENTS_MODE 0.862571
3760 DEF_60_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE 0.861250
3221 LIVINGAPARTMENTS_MEDI LIVINGAREA_MODE 0.857528
2370 LIVINGAREA_MODE ELEVATORS_MODE 0.855572
3280 LIVINGAREA_MEDI ELEVATORS_MODE 0.855441
2361 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.852440
1972 ELEVATORS_MODE LIVINGAREA_AVG 0.851989
3466 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.847788
3494 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.845127
3461 TOTALAREA_MODE ELEVATORS_AVG 0.844243
2896 ELEVATORS_MEDI LIVINGAREA_MODE 0.840698
2356 LIVINGAREA_MODE ELEVATORS_AVG 0.838606
3489 TOTALAREA_MODE ELEVATORS_MEDI 0.837394
2899 ELEVATORS_MEDI APARTMENTS_MEDI 0.836163
1051 ELEVATORS_AVG APARTMENTS_AVG 0.835854
2871 ELEVATORS_MEDI APARTMENTS_AVG 0.833887
2551 APARTMENTS_MEDI ELEVATORS_AVG 0.833254
3480 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.832763
1975 ELEVATORS_MODE APARTMENTS_MODE 0.824885
2565 APARTMENTS_MEDI ELEVATORS_MODE 0.824303
1961 ELEVATORS_MODE APARTMENTS_AVG 0.821279
3475 TOTALAREA_MODE ELEVATORS_MODE 0.820252
3229 LIVINGAPARTMENTS_MEDI ELEVATORS_MEDI 0.814055
3201 LIVINGAPARTMENTS_MEDI ELEVATORS_AVG 0.812459
1381 LIVINGAPARTMENTS_AVG ELEVATORS_AVG 0.812266
2881 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.809730
2885 ELEVATORS_MEDI APARTMENTS_MODE 0.807515
2305 LIVINGAPARTMENTS_MODE ELEVATORS_MODE 0.807429
1641 APARTMENTS_MODE ELEVATORS_AVG 0.804603
3215 LIVINGAPARTMENTS_MEDI ELEVATORS_MODE 0.799160
2895 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.798714
2291 LIVINGAPARTMENTS_MODE ELEVATORS_AVG 0.797127
1971 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.794240
197 AMT_GOODS_PRICE AMT_ANNUITY 0.775038
131 AMT_ANNUITY AMT_CREDIT 0.769993
1253 FLOORSMIN_AVG FLOORSMAX_AVG 0.740499
3101 FLOORSMIN_MEDI FLOORSMAX_MEDI 0.738489
3009 FLOORSMAX_MEDI FLOORSMIN_AVG 0.738135
3073 FLOORSMIN_MEDI FLOORSMAX_AVG 0.737900
3087 FLOORSMIN_MEDI FLOORSMAX_MODE 0.727306
2099 FLOORSMAX_MODE FLOORSMIN_AVG 0.726754
2177 FLOORSMIN_MODE FLOORSMAX_MODE 0.723838
3023 FLOORSMAX_MEDI FLOORSMIN_MODE 0.721085
2163 FLOORSMIN_MODE FLOORSMAX_AVG 0.720300
1442 LIVINGAREA_AVG BASEMENTAREA_AVG 0.692622
2622 BASEMENTAREA_MEDI LIVINGAREA_AVG 0.692100
3290 LIVINGAREA_MEDI BASEMENTAREA_MEDI 0.691377
2366 LIVINGAREA_MODE BASEMENTAREA_MODE 0.691004
3262 LIVINGAREA_MEDI BASEMENTAREA_AVG 0.689465
2636 BASEMENTAREA_MEDI LIVINGAREA_MODE 0.680749
2639 BASEMENTAREA_MEDI APARTMENTS_MEDI 0.680550
1186 FLOORSMAX_AVG ELEVATORS_AVG 0.680214
791 BASEMENTAREA_AVG APARTMENTS_AVG 0.679687
1715 BASEMENTAREA_MODE APARTMENTS_MODE 0.679153
2611 BASEMENTAREA_MEDI APARTMENTS_AVG 0.679040
2547 APARTMENTS_MEDI BASEMENTAREA_AVG 0.678465
3006 FLOORSMAX_MEDI ELEVATORS_AVG 0.677866
2352 LIVINGAREA_MODE BASEMENTAREA_AVG 0.677636
2878 ELEVATORS_MEDI FLOORSMAX_AVG 0.676166
3034 FLOORSMAX_MEDI ELEVATORS_MEDI 0.675587
1712 BASEMENTAREA_MODE LIVINGAREA_AVG 0.673834
3276 LIVINGAREA_MEDI BASEMENTAREA_MODE 0.673600
3457 TOTALAREA_MODE BASEMENTAREA_AVG 0.672591
2096 FLOORSMAX_MODE ELEVATORS_AVG 0.671292
3485 TOTALAREA_MODE BASEMENTAREA_MEDI 0.669373
2892 ELEVATORS_MEDI FLOORSMAX_MODE 0.669097
2625 BASEMENTAREA_MEDI APARTMENTS_MODE 0.668838
1637 APARTMENTS_MODE BASEMENTAREA_AVG 0.666304
2561 APARTMENTS_MEDI BASEMENTAREA_MODE 0.663966
1701 BASEMENTAREA_MODE APARTMENTS_AVG 0.661561
2110 FLOORSMAX_MODE ELEVATORS_MODE 0.660848
1968 ELEVATORS_MODE FLOORSMAX_AVG 0.656186
3020 FLOORSMAX_MEDI ELEVATORS_MODE 0.655573
2301 LIVINGAPARTMENTS_MODE BASEMENTAREA_MODE 0.654589
2041 ENTRANCES_MODE BASEMENTAREA_MODE 0.652464
2635 BASEMENTAREA_MEDI LIVINGAPARTMENTS_MODE 0.651936
1707 BASEMENTAREA_MODE ENTRANCES_AVG 0.651788
3225 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MEDI 0.651407
2617 BASEMENTAREA_MEDI ENTRANCES_AVG 0.650922
2951 ENTRANCES_MEDI BASEMENTAREA_MODE 0.650752
3471 TOTALAREA_MODE BASEMENTAREA_MODE 0.649395
2965 ENTRANCES_MEDI BASEMENTAREA_MEDI 0.649370
1117 ENTRANCES_AVG BASEMENTAREA_AVG 0.649305
3197 LIVINGAPARTMENTS_MEDI BASEMENTAREA_AVG 0.648881
2287 LIVINGAPARTMENTS_MODE BASEMENTAREA_AVG 0.648028
1377 LIVINGAPARTMENTS_AVG BASEMENTAREA_AVG 0.647323
2621 BASEMENTAREA_MEDI LIVINGAPARTMENTS_AVG 0.647128
2937 ENTRANCES_MEDI BASEMENTAREA_AVG 0.645062
3463 TOTALAREA_MODE FLOORSMAX_AVG 0.633005
1448 LIVINGAREA_AVG FLOORSMAX_AVG 0.630354
3491 TOTALAREA_MODE FLOORSMAX_MEDI 0.630352
2631 BASEMENTAREA_MEDI ENTRANCES_MODE 0.629266
3211 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MODE 0.629230
3012 FLOORSMAX_MEDI LIVINGAREA_AVG 0.628339
3268 LIVINGAREA_MEDI FLOORSMAX_AVG 0.626826
3296 LIVINGAREA_MEDI FLOORSMAX_MEDI 0.626045
2102 FLOORSMAX_MODE LIVINGAREA_AVG 0.626040
3477 TOTALAREA_MODE FLOORSMAX_MODE 0.625672
1711 BASEMENTAREA_MODE LIVINGAPARTMENTS_AVG 0.624821
2027 ENTRANCES_MODE BASEMENTAREA_AVG 0.624532
3282 LIVINGAREA_MEDI FLOORSMAX_MODE 0.623942
2371 LIVINGAREA_MODE ENTRANCES_MODE 0.621320
2357 LIVINGAREA_MODE ENTRANCES_AVG 0.621183
2961 ENTRANCES_MEDI LIVINGAREA_MODE 0.620597
1181 FLOORSMAX_AVG APARTMENTS_AVG 0.618626
3267 LIVINGAREA_MEDI ENTRANCES_AVG 0.617373
3295 LIVINGAREA_MEDI ENTRANCES_MEDI 0.617040
1447 LIVINGAREA_AVG ENTRANCES_AVG 0.616662
3001 FLOORSMAX_MEDI APARTMENTS_AVG 0.616419
2553 APARTMENTS_MEDI FLOORSMAX_AVG 0.615086
2091 FLOORSMAX_MODE APARTMENTS_AVG 0.614612
3029 FLOORSMAX_MEDI APARTMENTS_MEDI 0.614135
2040 ENTRANCES_MODE APARTMENTS_MODE 0.614042
2947 ENTRANCES_MEDI LIVINGAREA_AVG 0.612807
2567 APARTMENTS_MEDI FLOORSMAX_MODE 0.612492
2950 ENTRANCES_MEDI APARTMENTS_MODE 0.610753
1642 APARTMENTS_MODE ENTRANCES_AVG 0.610553
2552 APARTMENTS_MEDI ENTRANCES_AVG 0.609686
2964 ENTRANCES_MEDI APARTMENTS_MEDI 0.609468
1116 ENTRANCES_AVG APARTMENTS_AVG 0.609416
2372 LIVINGAREA_MODE FLOORSMAX_MODE 0.605787
2936 ENTRANCES_MEDI APARTMENTS_AVG 0.605650
In [27]:
get_corr_matrix(dataset = pd_loan_train[list_var_continuous], 
                metodo='spearman', size_figure=[10,8])
No description has been provided for this image
Out[27]:
0

También hemos hecho la matriz de spearman. Ambas matrices proporcionan resultados muy parecidos, por lo que llegamos a la conclusión de que es necesario realizar una simplificación de las variables para evitar problemas en el modelado, ya que muchas de nuestras variables no nos están aportando un valor añadido y nos hace el trabajo más complejo.

In [28]:
corr = pd_loan_train[list_var_continuous].corr('spearman')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) 
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
Out[28]:
level_0 level_1 correlation
2744 YEARS_BUILD_MEDI YEARS_BUILD_AVG 0.998416
2678 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_AVG 0.997450
3694 OBS_60_CNT_SOCIAL_CIRCLE OBS_30_CNT_SOCIAL_CIRCLE 0.997376
3140 LANDAREA_MEDI LANDAREA_AVG 0.996411
3074 FLOORSMIN_MEDI FLOORSMIN_AVG 0.996286
3206 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.996119
2810 COMMONAREA_MEDI COMMONAREA_AVG 0.995749
3272 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995541
2546 APARTMENTS_MEDI APARTMENTS_AVG 0.995248
3008 FLOORSMAX_MEDI FLOORSMAX_AVG 0.994943
2612 BASEMENTAREA_MEDI BASEMENTAREA_AVG 0.994792
2942 ENTRANCES_MEDI ENTRANCES_AVG 0.993437
2876 ELEVATORS_MEDI ELEVATORS_AVG 0.991171
2758 YEARS_BUILD_MEDI YEARS_BUILD_MODE 0.987994
1834 YEARS_BUILD_MODE YEARS_BUILD_AVG 0.987693
923 YEARS_BUILD_AVG YEARS_BEGINEXPLUATATION_AVG 0.986985
3088 FLOORSMIN_MEDI FLOORSMIN_MODE 0.986387
3022 FLOORSMAX_MEDI FLOORSMAX_MODE 0.986375
2771 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_MEDI 0.986048
2692 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.985598
2743 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_AVG 0.985557
1768 YEARS_BEGINEXPLUATATION_MODE YEARS_BEGINEXPLUATATION_AVG 0.985229
196 AMT_GOODS_PRICE AMT_CREDIT 0.984895
2679 YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_AVG 0.984651
2164 FLOORSMIN_MODE FLOORSMIN_AVG 0.982697
3338 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.981757
2098 FLOORSMAX_MODE FLOORSMAX_AVG 0.981643
1847 YEARS_BUILD_MODE YEARS_BEGINEXPLUATATION_MODE 0.980850
3404 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.980823
2890 ELEVATORS_MEDI ELEVATORS_MODE 0.977978
1833 YEARS_BUILD_MODE YEARS_BEGINEXPLUATATION_AVG 0.975123
2693 YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MODE 0.974860
3220 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.973453
1376 LIVINGAPARTMENTS_AVG APARTMENTS_AVG 0.973382
2824 COMMONAREA_MEDI COMMONAREA_MODE 0.971104
3154 LANDAREA_MEDI LANDAREA_MODE 0.970817
3224 LIVINGAPARTMENTS_MEDI APARTMENTS_MEDI 0.970764
2560 APARTMENTS_MEDI APARTMENTS_MODE 0.969877
2757 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_MODE 0.969857
1769 YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_AVG 0.969559
3196 LIVINGAPARTMENTS_MEDI APARTMENTS_AVG 0.969534
2296 LIVINGAPARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.969473
1966 ELEVATORS_MODE ELEVATORS_AVG 0.969201
3286 LIVINGAREA_MEDI LIVINGAREA_MODE 0.969048
2556 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.967592
2230 LANDAREA_MODE LANDAREA_AVG 0.967388
1900 COMMONAREA_MODE COMMONAREA_AVG 0.965749
2956 ENTRANCES_MEDI ENTRANCES_MODE 0.965027
1636 APARTMENTS_MODE APARTMENTS_AVG 0.964807
2362 LIVINGAREA_MODE LIVINGAREA_AVG 0.964042
2626 BASEMENTAREA_MEDI BASEMENTAREA_MODE 0.963008
1702 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.959573
2032 ENTRANCES_MODE ENTRANCES_AVG 0.957727
2300 LIVINGAPARTMENTS_MODE APARTMENTS_MODE 0.954970
3352 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.951175
2570 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.946656
2286 LIVINGAPARTMENTS_MODE APARTMENTS_AVG 0.943402
3418 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.941573
3467 TOTALAREA_MODE LIVINGAREA_AVG 0.939764
3495 TOTALAREA_MODE LIVINGAREA_MEDI 0.935193
3210 LIVINGAPARTMENTS_MEDI APARTMENTS_MODE 0.932577
2428 NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_AVG 0.931496
1646 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.928653
2494 NONLIVINGAREA_MODE NONLIVINGAREA_AVG 0.917995
3481 TOTALAREA_MODE LIVINGAREA_MODE 0.917386
1441 LIVINGAREA_AVG APARTMENTS_AVG 0.906637
3289 LIVINGAREA_MEDI APARTMENTS_MEDI 0.904824
3261 LIVINGAREA_MEDI APARTMENTS_AVG 0.902739
2557 APARTMENTS_MEDI LIVINGAREA_AVG 0.901828
3456 TOTALAREA_MODE APARTMENTS_AVG 0.900361
1451 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.897020
3484 TOTALAREA_MODE APARTMENTS_MEDI 0.895120
2365 LIVINGAREA_MODE APARTMENTS_MODE 0.893883
3299 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.893729
3207 LIVINGAPARTMENTS_MEDI LIVINGAREA_AVG 0.893102
3271 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.891945
2571 APARTMENTS_MEDI LIVINGAREA_MODE 0.878208
3466 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.876107
3470 TOTALAREA_MODE APARTMENTS_MODE 0.876080
2375 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.875014
3275 LIVINGAREA_MEDI APARTMENTS_MODE 0.874976
2351 LIVINGAREA_MODE APARTMENTS_AVG 0.874050
3494 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.871992
1647 APARTMENTS_MODE LIVINGAREA_AVG 0.870607
3285 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.869309
2297 LIVINGAPARTMENTS_MODE LIVINGAREA_AVG 0.866596
3221 LIVINGAPARTMENTS_MEDI LIVINGAREA_MODE 0.858764
2361 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.855415
3480 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.853394
1186 FLOORSMAX_AVG ELEVATORS_AVG 0.849515
3034 FLOORSMAX_MEDI ELEVATORS_MEDI 0.845722
3760 DEF_60_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE 0.845260
2878 ELEVATORS_MEDI FLOORSMAX_AVG 0.844534
3006 FLOORSMAX_MEDI ELEVATORS_AVG 0.842164
2110 FLOORSMAX_MODE ELEVATORS_MODE 0.833303
3020 FLOORSMAX_MEDI ELEVATORS_MODE 0.830893
131 AMT_ANNUITY AMT_CREDIT 0.830145
1968 ELEVATORS_MODE FLOORSMAX_AVG 0.829703
197 AMT_GOODS_PRICE AMT_ANNUITY 0.828087
2892 ELEVATORS_MEDI FLOORSMAX_MODE 0.826156
2096 FLOORSMAX_MODE ELEVATORS_AVG 0.822290
1448 LIVINGAREA_AVG FLOORSMAX_AVG 0.783432
3012 FLOORSMAX_MEDI LIVINGAREA_AVG 0.782136
3296 LIVINGAREA_MEDI FLOORSMAX_MEDI 0.780036
3463 TOTALAREA_MODE FLOORSMAX_AVG 0.778940
3491 TOTALAREA_MODE FLOORSMAX_MEDI 0.778108
3268 LIVINGAREA_MEDI FLOORSMAX_AVG 0.778077
2102 FLOORSMAX_MODE LIVINGAREA_AVG 0.776473
3477 TOTALAREA_MODE FLOORSMAX_MODE 0.776253
3282 LIVINGAREA_MEDI FLOORSMAX_MODE 0.774893
2372 LIVINGAREA_MODE FLOORSMAX_MODE 0.760936
1181 FLOORSMAX_AVG APARTMENTS_AVG 0.756658
3001 FLOORSMAX_MEDI APARTMENTS_AVG 0.755424
3029 FLOORSMAX_MEDI APARTMENTS_MEDI 0.752576
2091 FLOORSMAX_MODE APARTMENTS_AVG 0.751576
2553 APARTMENTS_MEDI FLOORSMAX_AVG 0.750744
2567 APARTMENTS_MEDI FLOORSMAX_MODE 0.749187
3026 FLOORSMAX_MEDI LIVINGAREA_MODE 0.747704
2358 LIVINGAREA_MODE FLOORSMAX_AVG 0.744613
2105 FLOORSMAX_MODE APARTMENTS_MODE 0.735494
1446 LIVINGAREA_AVG ELEVATORS_AVG 0.728879
1383 LIVINGAPARTMENTS_AVG FLOORSMAX_AVG 0.725581
2882 ELEVATORS_MEDI LIVINGAREA_AVG 0.724450
3011 FLOORSMAX_MEDI LIVINGAPARTMENTS_AVG 0.723351
3294 LIVINGAREA_MEDI ELEVATORS_MEDI 0.721458
3231 LIVINGAPARTMENTS_MEDI FLOORSMAX_MEDI 0.721198
3203 LIVINGAPARTMENTS_MEDI FLOORSMAX_AVG 0.720724
3015 FLOORSMAX_MEDI APARTMENTS_MODE 0.720197
3266 LIVINGAREA_MEDI ELEVATORS_AVG 0.719683
1442 LIVINGAREA_AVG BASEMENTAREA_AVG 0.719157
3461 TOTALAREA_MODE ELEVATORS_AVG 0.718700
2101 FLOORSMAX_MODE LIVINGAPARTMENTS_AVG 0.717937
1643 APARTMENTS_MODE FLOORSMAX_AVG 0.717494
3217 LIVINGAPARTMENTS_MEDI FLOORSMAX_MODE 0.716286
3262 LIVINGAREA_MEDI BASEMENTAREA_AVG 0.716269
3489 TOTALAREA_MODE ELEVATORS_MEDI 0.715511
3290 LIVINGAREA_MEDI BASEMENTAREA_MEDI 0.714780
2622 BASEMENTAREA_MEDI LIVINGAREA_AVG 0.712863
3457 TOTALAREA_MODE BASEMENTAREA_AVG 0.711425
1972 ELEVATORS_MODE LIVINGAREA_AVG 0.710057
3280 LIVINGAREA_MEDI ELEVATORS_MODE 0.708684
3475 TOTALAREA_MODE ELEVATORS_MODE 0.706810
3485 TOTALAREA_MODE BASEMENTAREA_MEDI 0.704676
2366 LIVINGAREA_MODE BASEMENTAREA_MODE 0.702810
2307 LIVINGAPARTMENTS_MODE FLOORSMAX_MODE 0.701770
2507 NONLIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE 0.699190
791 BASEMENTAREA_AVG APARTMENTS_AVG 0.699007
2636 BASEMENTAREA_MEDI LIVINGAREA_MODE 0.697937
1051 ELEVATORS_AVG APARTMENTS_AVG 0.697512
2352 LIVINGAREA_MODE BASEMENTAREA_AVG 0.697120
2547 APARTMENTS_MEDI BASEMENTAREA_AVG 0.696480
2639 BASEMENTAREA_MEDI APARTMENTS_MEDI 0.695627
2871 ELEVATORS_MEDI APARTMENTS_AVG 0.693965
2611 BASEMENTAREA_MEDI APARTMENTS_AVG 0.693159
1377 LIVINGAPARTMENTS_AVG BASEMENTAREA_AVG 0.692516
3025 FLOORSMAX_MEDI LIVINGAPARTMENTS_MODE 0.692421
1381 LIVINGAPARTMENTS_AVG ELEVATORS_AVG 0.692076
3197 LIVINGAPARTMENTS_MEDI BASEMENTAREA_AVG 0.690960
2899 ELEVATORS_MEDI APARTMENTS_MEDI 0.690799
2293 LIVINGAPARTMENTS_MODE FLOORSMAX_AVG 0.690080
3431 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI 0.689458
3225 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MEDI 0.689018
2551 APARTMENTS_MEDI ELEVATORS_AVG 0.688415
2370 LIVINGAREA_MODE ELEVATORS_MODE 0.688238
2881 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.687866
1583 NONLIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG 0.686392
2621 BASEMENTAREA_MEDI LIVINGAPARTMENTS_AVG 0.686389
3229 LIVINGAPARTMENTS_MEDI ELEVATORS_MEDI 0.686381
3201 LIVINGAPARTMENTS_MEDI ELEVATORS_AVG 0.685073
1715 BASEMENTAREA_MODE APARTMENTS_MODE 0.683641
1961 ELEVATORS_MODE APARTMENTS_AVG 0.681941
2565 APARTMENTS_MEDI ELEVATORS_MODE 0.680355
3403 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_AVG 0.677969
2625 BASEMENTAREA_MEDI APARTMENTS_MODE 0.677787
2287 LIVINGAPARTMENTS_MODE BASEMENTAREA_AVG 0.677529
2635 BASEMENTAREA_MEDI LIVINGAPARTMENTS_MODE 0.677384
1637 APARTMENTS_MODE BASEMENTAREA_AVG 0.677007
3276 LIVINGAREA_MEDI BASEMENTAREA_MODE 0.676060
2896 ELEVATORS_MEDI LIVINGAREA_MODE 0.674796
1971 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.674192
3215 LIVINGAPARTMENTS_MEDI ELEVATORS_MODE 0.673663
2301 LIVINGAPARTMENTS_MODE BASEMENTAREA_MODE 0.673447
3339 NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_AVG 0.672388
3471 TOTALAREA_MODE BASEMENTAREA_MODE 0.671685
1712 BASEMENTAREA_MODE LIVINGAREA_AVG 0.670638
2356 LIVINGAREA_MODE ELEVATORS_AVG 0.668383
1253 FLOORSMIN_AVG FLOORSMAX_AVG 0.664174
1975 ELEVATORS_MODE APARTMENTS_MODE 0.662330
3353 NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MODE 0.660924
3009 FLOORSMAX_MEDI FLOORSMIN_AVG 0.660834
3101 FLOORSMIN_MEDI FLOORSMAX_MEDI 0.660306
1117 ENTRANCES_AVG BASEMENTAREA_AVG 0.659750
2561 APARTMENTS_MEDI BASEMENTAREA_MODE 0.659651
2617 BASEMENTAREA_MEDI ENTRANCES_AVG 0.659611
2305 LIVINGAPARTMENTS_MODE ELEVATORS_MODE 0.658769
3073 FLOORSMIN_MEDI FLOORSMAX_AVG 0.658606
3417 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_MODE 0.658102
2965 ENTRANCES_MEDI BASEMENTAREA_MEDI 0.658026
1701 BASEMENTAREA_MODE APARTMENTS_AVG 0.653711
2937 ENTRANCES_MEDI BASEMENTAREA_AVG 0.652067
2099 FLOORSMAX_MODE FLOORSMIN_AVG 0.649632
3087 FLOORSMIN_MEDI FLOORSMAX_MODE 0.649325
2895 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.649242
3211 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MODE 0.647185
2041 ENTRANCES_MODE BASEMENTAREA_MODE 0.646627
2885 ELEVATORS_MEDI APARTMENTS_MODE 0.645129
2291 LIVINGAPARTMENTS_MODE ELEVATORS_AVG 0.644700
2493 NONLIVINGAREA_MODE NONLIVINGAPARTMENTS_AVG 0.643280
2951 ENTRANCES_MEDI BASEMENTAREA_MODE 0.642491
1711 BASEMENTAREA_MODE LIVINGAPARTMENTS_AVG 0.642454
2177 FLOORSMIN_MODE FLOORSMAX_MODE 0.642128
1707 BASEMENTAREA_MODE ENTRANCES_AVG 0.641863
1641 APARTMENTS_MODE ELEVATORS_AVG 0.639971
3023 FLOORSMAX_MEDI FLOORSMIN_MODE 0.638397
2429 NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_AVG 0.636192
2163 FLOORSMIN_MODE FLOORSMAX_AVG 0.636170
3460 TOTALAREA_MODE COMMONAREA_AVG 0.622166
2631 BASEMENTAREA_MEDI ENTRANCES_MODE 0.621522
3488 TOTALAREA_MODE COMMONAREA_MEDI 0.618277
2027 ENTRANCES_MODE BASEMENTAREA_AVG 0.614280
1447 LIVINGAREA_AVG ENTRANCES_AVG 0.609149
3267 LIVINGAREA_MEDI ENTRANCES_AVG 0.608763
3295 LIVINGAREA_MEDI ENTRANCES_MEDI 0.605690
2357 LIVINGAREA_MODE ENTRANCES_AVG 0.602625
2961 ENTRANCES_MEDI LIVINGAREA_MODE 0.601853
2947 ENTRANCES_MEDI LIVINGAREA_AVG 0.600747

TRATAMIENTO DE LOS VALORES NULOS

In [29]:
def get_percent_null_values_target(pd_loan, list_var_continuous, target):

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if pd_loan[i].isnull().sum() > 0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]
                                             .value_counts(normalize=True).reset_index()).T

            # Verifica cuántas columnas tiene pd_concat_percent antes de asignarlas
            if pd_concat_percent.shape[1] >= 2:
                pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                             pd_concat_percent.iloc[0, 1]]
            elif pd_concat_percent.shape[1] == 1:
                pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0]]

            pd_concat_percent = pd_concat_percent.drop(target, axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
            pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum() / pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)

    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final
In [30]:
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
Out[30]:
0.0 variable sum_null_values porcentaje_sum_null_values 1.0
0 1.000000 AMT_ANNUITY 9 0.000037 NaN
1 0.921397 AMT_GOODS_PRICE 229 0.000931 0.078603
2 0.914689 OWN_CAR_AGE 162464 0.660401 0.085311
3 1.000000 CNT_FAM_MEMBERS 2 0.000008 NaN
4 0.914966 EXT_SOURCE_1 138803 0.564221 0.085034
5 0.915709 EXT_SOURCE_2 522 0.002122 0.084291
6 0.907393 EXT_SOURCE_3 48787 0.198315 0.092607
7 0.908305 APARTMENTS_AVG 124849 0.507500 0.091695
8 0.910521 BASEMENTAREA_AVG 144000 0.585347 0.089479
9 0.907739 YEARS_BEGINEXPLUATATION_AVG 119997 0.487777 0.092261
10 0.913040 YEARS_BUILD_AVG 163592 0.664987 0.086960
11 0.914218 COMMONAREA_AVG 171913 0.698811 0.085782
12 0.908684 ELEVATORS_AVG 131073 0.532800 0.091316
13 0.907944 ENTRANCES_AVG 123816 0.503301 0.092056
14 0.907816 FLOORSMAX_AVG 122429 0.497663 0.092184
15 0.913647 FLOORSMIN_AVG 166943 0.678608 0.086353
16 0.911610 LANDAREA_AVG 146113 0.593936 0.088390
17 0.913778 LIVINGAPARTMENTS_AVG 168206 0.683742 0.086222
18 0.908343 LIVINGAREA_AVG 123471 0.501898 0.091657
19 0.914138 NONLIVINGAPARTMENTS_AVG 170833 0.694421 0.085862
20 0.909499 NONLIVINGAREA_AVG 135755 0.551832 0.090501
21 0.908305 APARTMENTS_MODE 124849 0.507500 0.091695
22 0.910521 BASEMENTAREA_MODE 144000 0.585347 0.089479
23 0.907739 YEARS_BEGINEXPLUATATION_MODE 119997 0.487777 0.092261
24 0.913040 YEARS_BUILD_MODE 163592 0.664987 0.086960
25 0.914218 COMMONAREA_MODE 171913 0.698811 0.085782
26 0.908684 ELEVATORS_MODE 131073 0.532800 0.091316
27 0.907944 ENTRANCES_MODE 123816 0.503301 0.092056
28 0.907816 FLOORSMAX_MODE 122429 0.497663 0.092184
29 0.913647 FLOORSMIN_MODE 166943 0.678608 0.086353
30 0.911610 LANDAREA_MODE 146113 0.593936 0.088390
31 0.913778 LIVINGAPARTMENTS_MODE 168206 0.683742 0.086222
32 0.908343 LIVINGAREA_MODE 123471 0.501898 0.091657
33 0.914138 NONLIVINGAPARTMENTS_MODE 170833 0.694421 0.085862
34 0.909499 NONLIVINGAREA_MODE 135755 0.551832 0.090501
35 0.908305 APARTMENTS_MEDI 124849 0.507500 0.091695
36 0.910521 BASEMENTAREA_MEDI 144000 0.585347 0.089479
37 0.907739 YEARS_BEGINEXPLUATATION_MEDI 119997 0.487777 0.092261
38 0.913040 YEARS_BUILD_MEDI 163592 0.664987 0.086960
39 0.914218 COMMONAREA_MEDI 171913 0.698811 0.085782
40 0.908684 ELEVATORS_MEDI 131073 0.532800 0.091316
41 0.907944 ENTRANCES_MEDI 123816 0.503301 0.092056
42 0.907816 FLOORSMAX_MEDI 122429 0.497663 0.092184
43 0.913647 FLOORSMIN_MEDI 166943 0.678608 0.086353
44 0.911610 LANDAREA_MEDI 146113 0.593936 0.088390
45 0.913778 LIVINGAPARTMENTS_MEDI 168206 0.683742 0.086222
46 0.908343 LIVINGAREA_MEDI 123471 0.501898 0.091657
47 0.914138 NONLIVINGAPARTMENTS_MEDI 170833 0.694421 0.085862
48 0.909499 NONLIVINGAREA_MEDI 135755 0.551832 0.090501
49 0.907445 TOTALAREA_MODE 118730 0.482627 0.092555
50 0.965599 OBS_30_CNT_SOCIAL_CIRCLE 843 0.003427 0.034401
51 0.965599 DEF_30_CNT_SOCIAL_CIRCLE 843 0.003427 0.034401
52 0.965599 OBS_60_CNT_SOCIAL_CIRCLE 843 0.003427 0.034401
53 0.965599 DEF_60_CNT_SOCIAL_CIRCLE 843 0.003427 0.034401
54 1.000000 DAYS_LAST_PHONE_CHANGE 1 0.000004 NaN
55 0.897417 AMT_REQ_CREDIT_BUREAU_HOUR 33251 0.135162 0.102583
56 0.897417 AMT_REQ_CREDIT_BUREAU_DAY 33251 0.135162 0.102583
57 0.897417 AMT_REQ_CREDIT_BUREAU_WEEK 33251 0.135162 0.102583
58 0.897417 AMT_REQ_CREDIT_BUREAU_MON 33251 0.135162 0.102583
59 0.897417 AMT_REQ_CREDIT_BUREAU_QRT 33251 0.135162 0.102583
60 0.897417 AMT_REQ_CREDIT_BUREAU_YEAR 33251 0.135162 0.102583

Vamos a cambiar los valores nulos de las columnas continuas por la mediana de los valores no nulos de la columna, se ha elegido la mediana ya que a diferencia de la media (promedio), la mediana no se ve afectada por valores atípicos o extremos en los datos. Al tener un dataset con variables con valores muy extremos (como se puede observar en los gráficos) la mediana es una mejor medida de tendencia central porque representa el punto medio de la distribución.

In [44]:
medianas_train = pd_loan_train[list_var_continuous].median()
pd_loan_train[list_var_continuous] = pd_loan_train[list_var_continuous].fillna(medianas_train)
pd_loan_test[list_var_continuous] = pd_loan_test[list_var_continuous].fillna(medianas_train)

Comprobamos que n existen valores nulos en las columnas

In [47]:
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
No existen variables con valores nulos
Out[47]:

TRATAMIENTO VARIABLES CATEGORICAS

In [48]:
list_var_cat
Out[48]:
['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']
In [50]:
from scipy.stats import chi2_contingency
def cramers_v(matrix):
    chi2, p, dof, ex = chi2_contingency(matrix)  # Chi-squared test
    return np.sqrt(chi2 / (matrix.sum().sum() * (min(matrix.shape) - 1)))

cramers_v_results = {}

for col in list_var_cat:
    # Calculamos la matriz de confusión entre las variable categórica y TARGET
    confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train[col])
    
    # Calculamos el Cramér's V para esta matriz
    cramers_v_value = cramers_v(confusion_matrix.values)
    
    # Guardamos el resultado en el diccionario
    cramers_v_results[col] = cramers_v_value

cramers_v_df = pd.DataFrame(list(cramers_v_results.items()), columns=['Variable', 'Cramér\'s V'])
cramers_v_df = cramers_v_df.sort_values(by='Cramér\'s V', ascending=False)

print(cramers_v_df)
                      Variable  Cramér's V
9              OCCUPATION_TYPE    0.078551
11           ORGANIZATION_TYPE    0.072094
5             NAME_INCOME_TYPE    0.063683
6          NAME_EDUCATION_TYPE    0.057060
1                  CODE_GENDER    0.055116
14          WALLSMATERIAL_MODE    0.045075
15         EMERGENCYSTATE_MODE    0.043118
13              HOUSETYPE_MODE    0.041642
7           NAME_FAMILY_STATUS    0.040930
8            NAME_HOUSING_TYPE    0.038018
0           NAME_CONTRACT_TYPE    0.031849
12          FONDKAPREMONT_MODE    0.031243
2                 FLAG_OWN_CAR    0.023452
4              NAME_TYPE_SUITE    0.012199
10  WEEKDAY_APPR_PROCESS_START    0.006439
3              FLAG_OWN_REALTY    0.006397

OPCIÓN 2

In [51]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
TARGET       0      1
TARGET               
0       226148      0
1            0  19860
Out[51]:
0.9999726128248573
In [52]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
cramers_v(confusion_matrix.values)
Out[52]:
0.9999726128248573
In [53]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["OWN_CAR_AGE"])
cramers_v(confusion_matrix.values)
Out[53]:
0.04159556349977331
In [54]:
pd_loan_train[list_var_cat] = pd_loan_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
pd_loan_test[list_var_cat] = pd_loan_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")

print(pd_loan_test)
           NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
SK_ID_CURR                                                               
344817             Cash loans           F            N               Y   
304895             Cash loans           M            N               Y   
274533             Cash loans           F            N               Y   
416053             Cash loans           M            Y               Y   
328656             Cash loans           F            N               Y   
...                       ...         ...          ...             ...   
314339             Cash loans           F            N               Y   
107600             Cash loans           F            N               Y   
342675             Cash loans           F            Y               N   
109967             Cash loans           M            N               Y   
326396             Cash loans           F            N               N   

            CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
SK_ID_CURR                                                            
344817                 0          157500.0    584766.0      23319.0   
304895                 1          157500.0    781920.0      57028.5   
274533                 0          112500.0    652500.0      30366.0   
416053                 0          135000.0    823500.0      29304.0   
328656                 0          180000.0    675000.0      20668.5   
...                  ...               ...         ...          ...   
314339                 2          112500.0    696150.0      33619.5   
107600                 1          189000.0    244512.0      16047.0   
342675                 0           67500.0    900000.0      26446.5   
109967                 0           76500.0    247500.0      12168.0   
326396                 0          180000.0    225000.0      12204.0   

            AMT_GOODS_PRICE NAME_TYPE_SUITE      NAME_INCOME_TYPE  \
SK_ID_CURR                                                          
344817             472500.0   Unaccompanied             Pensioner   
304895             675000.0   Unaccompanied               Working   
274533             652500.0          Family             Pensioner   
416053             823500.0   Unaccompanied               Working   
328656             675000.0   Unaccompanied         State servant   
...                     ...             ...                   ...   
314339             562500.0   Unaccompanied  Commercial associate   
107600             216000.0          Family  Commercial associate   
342675             900000.0   Unaccompanied             Pensioner   
109967             247500.0   Unaccompanied  Commercial associate   
326396             225000.0   Unaccompanied               Working   

                      NAME_EDUCATION_TYPE    NAME_FAMILY_STATUS  \
SK_ID_CURR                                                        
344817      Secondary / secondary special               Married   
304895      Secondary / secondary special               Married   
274533      Secondary / secondary special  Single / not married   
416053                  Incomplete higher        Civil marriage   
328656      Secondary / secondary special  Single / not married   
...                                   ...                   ...   
314339      Secondary / secondary special        Civil marriage   
107600      Secondary / secondary special               Married   
342675      Secondary / secondary special               Married   
109967      Secondary / secondary special               Married   
326396                  Incomplete higher  Single / not married   

            NAME_HOUSING_TYPE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
SK_ID_CURR                                                              
344817      House / apartment                    0.020713      -21399   
304895           With parents                    0.018029       -8704   
274533      House / apartment                    0.010500      -21103   
416053           With parents                    0.030755      -11442   
328656      House / apartment                    0.014520      -18708   
...                       ...                         ...         ...   
314339      House / apartment                    0.035792      -13185   
107600      House / apartment                    0.035792      -12913   
342675      House / apartment                    0.006305      -20172   
109967      House / apartment                    0.025164      -18737   
326396      House / apartment                    0.026392       -9988   

            DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  OWN_CAR_AGE  \
SK_ID_CURR                                                                   
344817             365243             -270.0            -3821          9.0   
304895               -278            -8445.0            -1148          9.0   
274533             365243           -12482.0            -4655          9.0   
416053               -622            -6053.0            -4076         24.0   
328656              -4385            -4404.0            -2266          9.0   
...                   ...                ...              ...          ...   
314339              -2072            -1959.0            -2330          9.0   
107600              -2315            -3532.0            -4541          9.0   
342675             365243            -6258.0            -3388         13.0   
109967               -711            -5999.0            -2261          9.0   
326396               -696            -4241.0            -2404          9.0   

            FLAG_MOBIL  FLAG_EMP_PHONE  FLAG_WORK_PHONE  FLAG_CONT_MOBILE  \
SK_ID_CURR                                                                  
344817               1               0                0                 1   
304895               1               1                0                 1   
274533               1               0                0                 1   
416053               1               1                0                 1   
328656               1               1                0                 1   
...                ...             ...              ...               ...   
314339               1               1                0                 1   
107600               1               1                0                 1   
342675               1               0                0                 1   
109967               1               1                1                 1   
326396               1               1                0                 1   

            FLAG_PHONE  FLAG_EMAIL OCCUPATION_TYPE  CNT_FAM_MEMBERS  \
SK_ID_CURR                                                            
344817               0           0       SIN VALOR              2.0   
304895               0           0        IT staff              3.0   
274533               0           0       SIN VALOR              1.0   
416053               0           0      Core staff              2.0   
328656               0           0        Laborers              1.0   
...                ...         ...             ...              ...   
314339               0           0       SIN VALOR              4.0   
107600               1           0       SIN VALOR              3.0   
342675               0           0       SIN VALOR              2.0   
109967               0           0        Laborers              2.0   
326396               1           1      Core staff              1.0   

            REGION_RATING_CLIENT  REGION_RATING_CLIENT_W_CITY  \
SK_ID_CURR                                                      
344817                         3                            2   
304895                         3                            2   
274533                         3                            3   
416053                         2                            2   
328656                         2                            2   
...                          ...                          ...   
314339                         2                            2   
107600                         2                            2   
342675                         3                            3   
109967                         2                            2   
326396                         2                            2   

           WEEKDAY_APPR_PROCESS_START  HOUR_APPR_PROCESS_START  \
SK_ID_CURR                                                       
344817                        TUESDAY                        8   
304895                         FRIDAY                       11   
274533                        TUESDAY                       12   
416053                       THURSDAY                       14   
328656                      WEDNESDAY                        8   
...                               ...                      ...   
314339                         MONDAY                       14   
107600                        TUESDAY                       11   
342675                        TUESDAY                       12   
109967                      WEDNESDAY                        9   
326396                         FRIDAY                       16   

            REG_REGION_NOT_LIVE_REGION  REG_REGION_NOT_WORK_REGION  \
SK_ID_CURR                                                           
344817                               0                           0   
304895                               0                           0   
274533                               0                           0   
416053                               0                           0   
328656                               0                           0   
...                                ...                         ...   
314339                               0                           0   
107600                               0                           1   
342675                               0                           0   
109967                               0                           0   
326396                               0                           0   

            LIVE_REGION_NOT_WORK_REGION  REG_CITY_NOT_LIVE_CITY  \
SK_ID_CURR                                                        
344817                                0                       0   
304895                                0                       0   
274533                                0                       0   
416053                                0                       0   
328656                                0                       0   
...                                 ...                     ...   
314339                                0                       1   
107600                                1                       0   
342675                                0                       0   
109967                                0                       0   
326396                                0                       0   

            REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY  \
SK_ID_CURR                                                    
344817                           0                        0   
304895                           0                        0   
274533                           0                        0   
416053                           0                        0   
328656                           0                        0   
...                            ...                      ...   
314339                           1                        0   
107600                           1                        1   
342675                           0                        0   
109967                           0                        0   
326396                           0                        0   

                 ORGANIZATION_TYPE  EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  \
SK_ID_CURR                                                                     
344817                         XNA      0.815456      0.659799      0.633032   
304895      Business Entity Type 3      0.307349      0.459206      0.535276   
274533                         XNA      0.773641      0.391290      0.584990   
416053           Transport: type 2      0.381918      0.614966      0.137654   
328656                      School      0.506199      0.344191      0.535276   
...                            ...           ...           ...           ...   
314339               Self-employed      0.357698      0.597733      0.384207   
107600                  Government      0.506199      0.293956      0.675413   
342675                         XNA      0.507980      0.638326      0.520898   
109967      Business Entity Type 3      0.506199      0.381001      0.195198   
326396                  Restaurant      0.332320      0.606654      0.344155   

            APARTMENTS_AVG  BASEMENTAREA_AVG  YEARS_BEGINEXPLUATATION_AVG  \
SK_ID_CURR                                                                  
344817              0.2165            0.1626                       0.9876   
304895              0.1124            0.0408                       0.9871   
274533              0.0876            0.0764                       0.9816   
416053              0.0876            0.0764                       0.9816   
328656              0.0732            0.0764                       0.9876   
...                    ...               ...                          ...   
314339              0.0876            0.0764                       0.9816   
107600              0.0825            0.0793                       0.9737   
342675              0.1237            0.1208                       0.9876   
109967              0.0613            0.0405                       0.9702   
326396              0.0825            0.0814                       0.9752   

            YEARS_BUILD_AVG  COMMONAREA_AVG  ELEVATORS_AVG  ENTRANCES_AVG  \
SK_ID_CURR                                                                  
344817               0.8300          0.0553           0.24         0.2069   
304895               0.8232          0.0398           0.16         0.1379   
274533               0.7552          0.0211           0.00         0.1379   
416053               0.7552          0.0211           0.00         0.1379   
328656               0.7552          0.0211           0.00         0.0345   
...                     ...             ...            ...            ...   
314339               0.7552          0.0211           0.00         0.1379   
107600               0.6396          0.0076           0.00         0.1379   
342675               0.8300          0.0204           0.00         0.2759   
109967               0.5920          0.0271           0.00         0.1552   
326396               0.7552          0.0211           0.00         0.1379   

            FLOORSMAX_AVG  FLOORSMIN_AVG  LANDAREA_AVG  LIVINGAPARTMENTS_AVG  \
SK_ID_CURR                                                                     
344817             0.3750         0.4167        0.0283                0.1765   
304895             0.3750         0.3333        0.1164                0.0866   
274533             0.1667         0.2083        0.0484                0.0756   
416053             0.1667         0.2083        0.0484                0.0756   
328656             0.1667         0.2083        0.0484                0.0756   
...                   ...            ...           ...                   ...   
314339             0.1667         0.2083        0.0484                0.0756   
107600             0.1667         0.2083        0.0687                0.0672   
342675             0.1667         0.2083        0.0305                0.0992   
109967             0.1250         0.1667        0.0762                0.0458   
326396             0.1667         0.2083        0.0484                0.0756   

            LIVINGAREA_AVG  NONLIVINGAPARTMENTS_AVG  NONLIVINGAREA_AVG  \
SK_ID_CURR                                                               
344817              0.2588                   0.0000             0.0000   
304895              0.1333                   0.0232             0.1594   
274533              0.0746                   0.0000             0.0036   
416053              0.0746                   0.0000             0.0036   
328656              0.0746                   0.0000             0.0036   
...                    ...                      ...                ...   
314339              0.0746                   0.0000             0.0036   
107600              0.0704                   0.0000             0.0000   
342675              0.1211                   0.0077             0.0573   
109967              0.0650                   0.0193             0.0458   
326396              0.0725                   0.0000             0.0000   

            APARTMENTS_MODE  BASEMENTAREA_MODE  YEARS_BEGINEXPLUATATION_MODE  \
SK_ID_CURR                                                                     
344817               0.2206             0.1688                        0.9876   
304895               0.1145             0.0424                        0.9871   
274533               0.0840             0.0747                        0.9816   
416053               0.0840             0.0747                        0.9816   
328656               0.0746             0.0747                        0.9876   
...                     ...                ...                           ...   
314339               0.0840             0.0747                        0.9816   
107600               0.0840             0.0823                        0.9737   
342675               0.1261             0.1254                        0.9876   
109967               0.0252             0.0190                        0.9692   
326396               0.0840             0.0845                        0.9752   

            YEARS_BUILD_MODE  COMMONAREA_MODE  ELEVATORS_MODE  ENTRANCES_MODE  \
SK_ID_CURR                                                                      
344817                0.8367           0.0558          0.2417          0.2069   
304895                0.8301           0.0401          0.1611          0.1379   
274533                0.7648           0.0190          0.0000          0.1379   
416053                0.7648           0.0190          0.0000          0.1379   
328656                0.7648           0.0190          0.0000          0.0345   
...                      ...              ...             ...             ...   
314339                0.7648           0.0190          0.0000          0.1379   
107600                0.6537           0.0077          0.0000          0.1379   
342675                0.8367           0.0205          0.0000          0.2759   
109967                0.5949           0.0110          0.0000          0.0690   
326396                0.7648           0.0190          0.0000          0.1379   

            FLOORSMAX_MODE  FLOORSMIN_MODE  LANDAREA_MODE  \
SK_ID_CURR                                                  
344817              0.3750          0.4167         0.0290   
304895              0.3750          0.3333         0.1191   
274533              0.1667          0.2083         0.0460   
416053              0.1667          0.2083         0.0460   
328656              0.1667          0.2083         0.0460   
...                    ...             ...            ...   
314339              0.1667          0.2083         0.0460   
107600              0.1667          0.2083         0.0703   
342675              0.1667          0.2083         0.0312   
109967              0.0833          0.1250         0.0774   
326396              0.1667          0.2083         0.0460   

            LIVINGAPARTMENTS_MODE  LIVINGAREA_MODE  NONLIVINGAPARTMENTS_MODE  \
SK_ID_CURR                                                                     
344817                     0.1928           0.2697                    0.0000   
304895                     0.0946           0.1389                    0.0233   
274533                     0.0771           0.0731                    0.0000   
416053                     0.0771           0.0731                    0.0000   
328656                     0.0771           0.0731                    0.0000   
...                           ...              ...                       ...   
314339                     0.0771           0.0731                    0.0000   
107600                     0.0735           0.0734                    0.0000   
342675                     0.1084           0.1262                    0.0078   
109967                     0.0220           0.0247                    0.0000   
326396                     0.0771           0.0756                    0.0000   

            NONLIVINGAREA_MODE  APARTMENTS_MEDI  BASEMENTAREA_MEDI  \
SK_ID_CURR                                                           
344817                  0.0000           0.2186             0.1626   
304895                  0.1688           0.1135             0.0408   
274533                  0.0011           0.0874             0.0760   
416053                  0.0011           0.0874             0.0760   
328656                  0.0011           0.0739             0.0760   
...                        ...              ...                ...   
314339                  0.0011           0.0874             0.0760   
107600                  0.0000           0.0833             0.0793   
342675                  0.0606           0.1249             0.1208   
109967                  0.0000           0.0619             0.0405   
326396                  0.0000           0.0833             0.0814   

            YEARS_BEGINEXPLUATATION_MEDI  YEARS_BUILD_MEDI  COMMONAREA_MEDI  \
SK_ID_CURR                                                                    
344817                            0.9876            0.8323           0.0557   
304895                            0.9871            0.8256           0.0400   
274533                            0.9816            0.7585           0.0209   
416053                            0.9816            0.7585           0.0209   
328656                            0.9876            0.7585           0.0209   
...                                  ...               ...              ...   
314339                            0.9816            0.7585           0.0209   
107600                            0.9737            0.6444           0.0077   
342675                            0.9876            0.8323           0.0205   
109967                            0.9702            0.5975           0.0273   
326396                            0.9752            0.7585           0.0209   

            ELEVATORS_MEDI  ENTRANCES_MEDI  FLOORSMAX_MEDI  FLOORSMIN_MEDI  \
SK_ID_CURR                                                                   
344817                0.24          0.2069          0.3750          0.4167   
304895                0.16          0.1379          0.3750          0.3333   
274533                0.00          0.1379          0.1667          0.2083   
416053                0.00          0.1379          0.1667          0.2083   
328656                0.00          0.0345          0.1667          0.2083   
...                    ...             ...             ...             ...   
314339                0.00          0.1379          0.1667          0.2083   
107600                0.00          0.1379          0.1667          0.2083   
342675                0.00          0.2759          0.1667          0.2083   
109967                0.00          0.1552          0.1250          0.1667   
326396                0.00          0.1379          0.1667          0.2083   

            LANDAREA_MEDI  LIVINGAPARTMENTS_MEDI  LIVINGAREA_MEDI  \
SK_ID_CURR                                                          
344817             0.0288                 0.1796           0.2635   
304895             0.1185                 0.0881           0.1357   
274533             0.0489                 0.0765           0.0750   
416053             0.0489                 0.0765           0.0750   
328656             0.0489                 0.0765           0.0750   
...                   ...                    ...              ...   
314339             0.0489                 0.0765           0.0750   
107600             0.0699                 0.0684           0.0717   
342675             0.0310                 0.1009           0.1233   
109967             0.0776                 0.0466           0.0662   
326396             0.0489                 0.0765           0.0738   

            NONLIVINGAPARTMENTS_MEDI  NONLIVINGAREA_MEDI FONDKAPREMONT_MODE  \
SK_ID_CURR                                                                    
344817                        0.0000              0.0000   reg oper account   
304895                        0.0233              0.1628   reg oper account   
274533                        0.0000              0.0031          SIN VALOR   
416053                        0.0000              0.0031          SIN VALOR   
328656                        0.0000              0.0031          SIN VALOR   
...                              ...                 ...                ...   
314339                        0.0000              0.0031          SIN VALOR   
107600                        0.0000              0.0000   reg oper account   
342675                        0.0078              0.0585   reg oper account   
109967                        0.0194              0.0467   reg oper account   
326396                        0.0000              0.0000          SIN VALOR   

            HOUSETYPE_MODE  TOTALAREA_MODE WALLSMATERIAL_MODE  \
SK_ID_CURR                                                      
344817      block of flats          0.2338              Panel   
304895      block of flats          0.1613              Panel   
274533           SIN VALOR          0.0688          SIN VALOR   
416053           SIN VALOR          0.0688          SIN VALOR   
328656           SIN VALOR          0.0483          SIN VALOR   
...                    ...             ...                ...   
314339           SIN VALOR          0.0688          SIN VALOR   
107600      block of flats          0.0554              Panel   
342675      block of flats          0.1077              Panel   
109967      block of flats          0.0246       Stone, brick   
326396      block of flats          0.0570              Panel   

           EMERGENCYSTATE_MODE  OBS_30_CNT_SOCIAL_CIRCLE  \
SK_ID_CURR                                                 
344817                      No                       0.0   
304895                      No                       1.0   
274533               SIN VALOR                       0.0   
416053               SIN VALOR                       0.0   
328656                      No                       1.0   
...                        ...                       ...   
314339               SIN VALOR                       6.0   
107600                      No                       4.0   
342675                      No                       0.0   
109967                      No                       1.0   
326396                      No                       0.0   

            DEF_30_CNT_SOCIAL_CIRCLE  OBS_60_CNT_SOCIAL_CIRCLE  \
SK_ID_CURR                                                       
344817                           0.0                       0.0   
304895                           1.0                       1.0   
274533                           0.0                       0.0   
416053                           0.0                       0.0   
328656                           0.0                       1.0   
...                              ...                       ...   
314339                           0.0                       6.0   
107600                           0.0                       4.0   
342675                           0.0                       0.0   
109967                           0.0                       1.0   
326396                           0.0                       0.0   

            DEF_60_CNT_SOCIAL_CIRCLE  DAYS_LAST_PHONE_CHANGE  FLAG_DOCUMENT_2  \
SK_ID_CURR                                                                      
344817                           0.0                 -1068.0                0   
304895                           1.0                  -805.0                0   
274533                           0.0                     0.0                0   
416053                           0.0                  -860.0                0   
328656                           0.0                     0.0                0   
...                              ...                     ...              ...   
314339                           0.0                     0.0                0   
107600                           0.0                     0.0                0   
342675                           0.0                  -456.0                0   
109967                           0.0                  -656.0                0   
326396                           0.0                  -328.0                0   

            FLAG_DOCUMENT_3  FLAG_DOCUMENT_4  FLAG_DOCUMENT_5  \
SK_ID_CURR                                                      
344817                    1                0                0   
304895                    1                0                0   
274533                    0                0                0   
416053                    1                0                0   
328656                    1                0                0   
...                     ...              ...              ...   
314339                    1                0                0   
107600                    1                0                0   
342675                    0                0                0   
109967                    1                0                0   
326396                    1                0                0   

            FLAG_DOCUMENT_6  FLAG_DOCUMENT_7  FLAG_DOCUMENT_8  \
SK_ID_CURR                                                      
344817                    0                0                0   
304895                    0                0                0   
274533                    1                0                0   
416053                    0                0                0   
328656                    0                0                0   
...                     ...              ...              ...   
314339                    0                0                0   
107600                    0                0                0   
342675                    1                0                0   
109967                    0                0                0   
326396                    0                0                0   

            FLAG_DOCUMENT_9  FLAG_DOCUMENT_10  FLAG_DOCUMENT_11  \
SK_ID_CURR                                                        
344817                    0                 0                 0   
304895                    0                 0                 0   
274533                    0                 0                 0   
416053                    0                 0                 0   
328656                    0                 0                 0   
...                     ...               ...               ...   
314339                    0                 0                 0   
107600                    0                 0                 0   
342675                    0                 0                 0   
109967                    0                 0                 0   
326396                    0                 0                 0   

            FLAG_DOCUMENT_12  FLAG_DOCUMENT_13  FLAG_DOCUMENT_14  \
SK_ID_CURR                                                         
344817                     0                 0                 0   
304895                     0                 0                 0   
274533                     0                 0                 0   
416053                     0                 0                 0   
328656                     0                 0                 0   
...                      ...               ...               ...   
314339                     0                 0                 0   
107600                     0                 0                 0   
342675                     0                 0                 0   
109967                     0                 0                 0   
326396                     0                 0                 0   

            FLAG_DOCUMENT_15  FLAG_DOCUMENT_16  FLAG_DOCUMENT_17  \
SK_ID_CURR                                                         
344817                     0                 0                 0   
304895                     0                 0                 0   
274533                     0                 0                 0   
416053                     0                 0                 0   
328656                     0                 0                 0   
...                      ...               ...               ...   
314339                     0                 0                 0   
107600                     0                 0                 0   
342675                     0                 0                 0   
109967                     0                 0                 0   
326396                     0                 0                 0   

            FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20  \
SK_ID_CURR                                                         
344817                     0                 0                 0   
304895                     0                 0                 0   
274533                     0                 0                 0   
416053                     0                 0                 0   
328656                     0                 0                 0   
...                      ...               ...               ...   
314339                     0                 0                 0   
107600                     0                 0                 0   
342675                     0                 0                 0   
109967                     0                 0                 0   
326396                     0                 0                 0   

            FLAG_DOCUMENT_21  AMT_REQ_CREDIT_BUREAU_HOUR  \
SK_ID_CURR                                                 
344817                     0                         0.0   
304895                     0                         0.0   
274533                     0                         0.0   
416053                     0                         0.0   
328656                     0                         0.0   
...                      ...                         ...   
314339                     0                         0.0   
107600                     0                         0.0   
342675                     0                         0.0   
109967                     0                         0.0   
326396                     0                         0.0   

            AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_WEEK  \
SK_ID_CURR                                                          
344817                            0.0                         0.0   
304895                            0.0                         0.0   
274533                            0.0                         0.0   
416053                            0.0                         0.0   
328656                            0.0                         0.0   
...                               ...                         ...   
314339                            0.0                         0.0   
107600                            0.0                         0.0   
342675                            0.0                         0.0   
109967                            0.0                         0.0   
326396                            0.0                         0.0   

            AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  \
SK_ID_CURR                                                         
344817                            0.0                        1.0   
304895                            0.0                        0.0   
274533                            0.0                        0.0   
416053                            1.0                        0.0   
328656                            0.0                        0.0   
...                               ...                        ...   
314339                            0.0                        0.0   
107600                            0.0                        0.0   
342675                            0.0                        1.0   
109967                            0.0                        0.0   
326396                            1.0                        1.0   

            AMT_REQ_CREDIT_BUREAU_YEAR  TARGET  
SK_ID_CURR                                      
344817                             4.0       0  
304895                             1.0       1  
274533                             2.0       0  
416053                             1.0       0  
328656                             1.0       0  
...                                ...     ...  
314339                             1.0       0  
107600                             1.0       0  
342675                             0.0       0  
109967                             0.0       1  
326396                             1.0       0  

[61503 rows x 121 columns]
In [56]:
pd_loan_train.isna().sum().sum()
Out[56]:
0
In [57]:
pd_loan_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")